Source code for lena.structures.split_into_bins

"""Split analysis into groups defined by bins."""
import copy
import itertools

import lena.context
import lena.core
import lena.flow
import lena.math
import lena.structures
import lena.variables
from .hist_functions import (
    cell_to_string, get_example_bin, iter_bins_with_edges
)


class _MdSeqMap(object):
    """Multidimensional mapping of a *Sequence*."""

    def __init__(self, generator, array):
        """*generator* is mapped to *array*'s contents.
        Example when a bin is a sequence:
        ``generator=lambda cell: cell.compute()``.
        """
        self._generators = lena.math.md_map(generator, array)
        # self._arr = arr
        #, self.bins)

    def next(self):
        # Python 2
        return lena.math.md_map(next, self._generators)

    def __next__(self):
        # Python 3
        return self.next()

    def __iter__(self):
        return self


[docs]class IterateBins(object): """Iterate bins of histograms.""" def __init__(self, create_edges_str=None, select_bins=None): """*create_edges_str* is a callable that creates a string from bin's edges and coordinate names and adds that to the context. It is passed parameters *(edges, var_context)*, where *var_context* is *variable* context containing variable names (it can be a single :class:`.Variable` or :class:`.Combine`). By default it is :func:`.cell_to_string`. *select_bins* is a callable used to test bin contents. By default, only those histograms are iterated where bins contain histograms. Use *select_bins* to choose other classes. See :class:`.Selector` for examples. If *create_edges_str* is not callable, :exc:`.LenaTypeError` is raised. """ if create_edges_str is None: # default create_edges_str = cell_to_string elif not callable(create_edges_str): raise lena.core.LenaTypeError( "create_edges_str must be callable, " "{} provided".format(create_edges_str) ) self._create_edges_str = create_edges_str # todo: create a selector for bins # (example bins or some iterable) if select_bins is None: # select if bins contain histograms self._select_bins = lena.flow.Selector(lena.structures.histogram) else: self._select_bins = lena.flow.Selector(select_bins)
[docs] def run(self, flow): """Yield histogram bins one by one. For each :class:`.histogram` from the *flow*, if its bins pass *select_bins*, they are iterated. The resulting context is taken from bin's context. Histogram's context is preserved in *context.bins*. *context.bin* is updated with "edges" (with bin edges) and "edges_str" (their representation). If histogram's context contains *variable*, that is used for edges' representation. Not histograms pass unchanged. """ update_nested = lena.context.update_nested for val in flow: data, hist_context = lena.flow.get_data_context(val) # select histograms if not isinstance(data, lena.structures.histogram): yield val continue # bins also contain histograms data00 = lena.flow.get_data(get_example_bin(data)) if not self._select_bins(data00): yield val continue for histc, bin_edges in iter_bins_with_edges(data.bins, data.edges): hist, bin_context = lena.flow.get_data_context(histc) # we assume that context.variable corresponds to # the variable used for bins. # If that is some old variable left in the context, # remove that first! Anyway SplitIntoBins would create # its correct variable (if called before this element). split_var_context = lena.context.get_recursively( hist_context, "variable", None ) # split_var_context is not modified here. edges_str = self._create_edges_str( bin_edges, var_context=split_var_context ) context_bin = {"edges": bin_edges, "edges_str": edges_str} # bin_context is main analysis context. # Update it with the surrounding histogram context. # It will have some duplication, but a complete copy # is needed, because it can be different in general. update_nested("bins", bin_context, copy.deepcopy(hist_context)) # Bin position details are in *context.bin* update_nested("bin", bin_context, context_bin) yield (hist, bin_context)
[docs]class MapBins(object): """Transform bin content of histograms. This class can be used when histogram bins contain complex structures. For example, in order to plot a histogram with a 3-dimensional vector in each bin, one can create 3 histograms corresponding to the vector's components. """ def __init__(self, seq, select_bins=lambda _: True, get_example_bin=get_example_bin, drop_bins_context=True): """*seq* is a sequence or an element applied to bin contents. If *seq* is not a :class:`.Sequence` or an element with *run* method, it is converted to a :class:`.Sequence`. Example: ``seq=Split([X(), Y(), Z()])`` (provided that you have X, Y, Z variables). If *select_bins* applied to histogram bins is ``True`` (tested on an arbitrary bin), the histogram is transformed. Bin types can be given in a ``list`` or as a general :class:`.Selector`. For example, ``select_bins=[lena.math.vector3, list]`` selects histograms where bins are vectors or lists. By default all histograms are accepted. The "arbitrary bin" is returned by a callable *get_example_bin* (by default :func:`.get_example_bin`). :class:`.MapBins` creates histograms that may be plotted, because their bins contain only data without context. If *drop_bins_context* is ``False``, context remains in bins. By default, context of all histogram bins is discarded. This discourages compositions of :class:`.MapBins`: make compositions of their internal sequences instead. In case of incorrect arguments, :exc:`.LenaTypeError` is raised. """ if not lena.core.is_run_el(seq): try: seq = lena.core.Sequence(seq) except lena.core.LenaTypeError: raise lena.core.LenaTypeError( "seq must be a Sequence or convertible to that, " "or an element with run method; " "{} provided".format(seq) ) self._seq = seq if not isinstance(select_bins, lena.flow.Selector): try: select_bins = lena.flow.Selector(select_bins) except lena.core.LenaTypeError: raise lena.core.LenaTypeError( "select_bins must be convertible to a Selector, " "{} given".format(select_bins) ) self._select_bins = select_bins assert callable(get_example_bin) self._get_example_bin = get_example_bin self._drop_bins_context = bool(drop_bins_context)
[docs] def run(self, flow): """Transform histograms from *flow*. *context.value* is updated with bin context (if that exists). It is assumed that all bins have the same context (because they were produced by the same sequence), therefore an arbitrary bin is taken and contexts of all other bins are ignored. Not selected values pass unchanged. """ get_example_bin = self._get_example_bin for val in flow: hist, context = lena.flow.get_data_context(val) update_nested = lena.context.update_nested # histograms are selected if not isinstance(hist, lena.structures.histogram): yield val continue # bin content is selected bin_ = get_example_bin(hist) if not self._select_bins(bin_): # no transformation needed yield val continue # bins are be transformed # Several iterations can happen, in principle. generators = _MdSeqMap( lambda cell: copy.deepcopy(self._seq).run([cell]), hist.bins ) for new_bins in generators: if self._drop_bins_context: new_data = lena.math.md_map(lena.flow.get_data, new_bins) else: # leave context in bins new_data = new_bins # deep copy, or make histogram.edges immutable edges = copy.deepcopy(hist.edges) new_hist = lena.structures.histogram(edges, new_data) ## update context new_context = copy.deepcopy(context) # no compositions of variables, because # there is no composition between edges' # and bins' transformations # (all common compositions are already counted) bin_context = copy.deepcopy( lena.flow.get_context(get_example_bin(new_bins)) ) # instead of having SIB context in histogram, # it moved it to the histogram in this element! # hist_context = sib_context.get("histogram", {}) # if hist_context: # lena.context.update_nested( # "histogram", context, copy.deepcopy(hist_context) # ) # we name it "value" and hope to have the same name # for graph values. if bin_context: update_nested("value", new_context, bin_context) # one might optimise copying of the context here, # but we leave it like this (because it had bugs) yield (new_hist, new_context)
[docs]class SplitIntoBins(): """Split analysis into groups defined by bins.""" def __init__(self, seq, arg_var, edges): """*seq* is a :class:`.FillComputeSeq` sequence (or will be converted to that) that corresponds to the analysis being performed for different bins. Deep copy of *seq* is done for each bin. *arg_var* is a :class:`.Variable` that takes data and returns value used to compute the bin index. Example of a two-dimensional function: ``arg_var = lena.variables.Variable("xy", lambda event: (event.x, event.y))``. *edges* is a sequence of arrays containing monotonically increasing bin edges along each dimension. Example: ``edges = lena.math.mesh((0, 1), 10)``. Note ---- The final histogram may contain vectors, histograms and any other data the analysis produced. To plot them, one can extract vector components with e.g. :class:`.MapBins`. If bin contents are histograms, they can be yielded one by one with :class:`.IterateBins`. **Attributes**: bins, edges. If *edges* are not increasing, :exc:`.LenaValueError` is raised. In case of other argument initialization problems, :exc:`.LenaTypeError` is raised. """ if not isinstance(seq, lena.core.FillComputeSeq): try: seq = lena.core.FillComputeSeq(seq) except lena.core.LenaTypeError: raise lena.core.LenaTypeError( "seq must contain a FillCompute element, " "{} provided".format(seq) ) if isinstance(arg_var, lena.variables.Variable): self._arg_var = arg_var self._arg_func = arg_var.getter else: raise lena.core.LenaTypeError( "arg_var must be a Variable, " "{} provided.".format(arg_var) ) # may raise LenaValueError lena.structures.check_edges_increasing(edges) self.bins = lena.structures.init_bins(edges, seq, deepcopy=True) self.edges = edges self._cur_context = {}
[docs] def fill(self, val): """Fill the cell corresponding to *arg_var(val)* with *val*. Values outside the :attr:`edges` are ignored. """ data, context = lena.flow.get_data_context(val) # deep copy, because internal sequences may modify context # This is inefficient, # because only context of the last value is needed # todo: create a method _fill_data, # and fill with context only for the last time context = copy.deepcopy(context) bin_index = lena.structures.get_bin_on_value(self._arg_func(data), self.edges) # we fill it manually, because histogram.fill does # subarr[ind] += weight subarr = self.bins for ind in bin_index: # underflow if ind < 0: return try: subarr = subarr[ind] # overflow except IndexError: return # subarr is now the cell self.edges[bin_index] subarr.fill(val) self._cur_context = context
[docs] def compute(self): """Yield a *(histogram, context)* pair for each *compute()* for all bins. The :class:`.histogram` is created from :attr:`edges` with bin contents taken from *compute()* for :attr:`bins`. Computational context is preserved in histogram's bins. :class:`.SplitIntoBins` adds context as a subcontext *variable* (corresponding to *arg_var*). This allows unification of :class:`.SplitIntoBins` with common analysis using variables (useful when creating plots from one template). Existing context values are preserved. Note ---- In Python 3 the minimum number of *compute()* among all bins is used. In Python 2, if some bin is exhausted before the others, its content will be filled with ``None``. """ # no copy, because cur_context is copied during fill() cur_context = self._cur_context # cur_context = copy.deepcopy(self._cur_context) # update context.variable self._arg_var._update_context(cur_context, copy.deepcopy(self._arg_var.var_context)) generators = _MdSeqMap(lambda cell: cell.compute(), self.bins) # generators = lena.math.md_map(lambda cell: cell.compute(), self.bins) while True: try: result = next(generators) except StopIteration: break # result = lena.math.md_map(next, generators) hist = lena.structures.histogram(self.edges, result) yield (hist, copy.deepcopy(cur_context))