"""Split analysis on groups set by bins."""
from __future__ import print_function
import copy
import lena.context
import lena.core
import lena.flow
import lena.math
import lena.structures
import lena.variables
def _iter_bins_with_edges(bins, edges):
    """Yield *(bin content, bin edges)* pairs for every cell of *bins*.

    *bins* is a (possibly nested) array of bin contents,
    *edges* are the bin edges: a list of numbers in the one-dimensional
    case, or a list of such lists (one per coordinate) otherwise.

    *Bin edges* is a tuple, such that at index *i*
    its element is bin's *(lower bound, upper bound)*
    on the *i*-th coordinate.

    Cells are visited in row-major order (the last index changes fastest).
    Every combination of per-axis indices is visited exactly once.
    """
    # Function-scope import: itertools is not imported at module level.
    import itertools

    # One-dimensional edges are given as a flat sequence of numbers;
    # wrap them so that edges[axis] is always a list of edges.
    if not isinstance(edges[0], list):
        edges = [edges]

    # An axis with N edges has N-1 bins.
    axis_indices = [range(len(axis_edges) - 1) for axis_edges in edges]

    for index in itertools.product(*axis_indices):
        # get_bin_on_index was originally called with a list index.
        bin_ = lena.structures.get_bin_on_index(list(index), bins)
        cell_edges = tuple(
            (edges[axis][ind], edges[axis][ind + 1])
            for axis, ind in enumerate(index)
        )
        yield (bin_, cell_edges)
class _MdSeqMap(object):
    """Iterator that advances a multidimensional array of generators.

    Each step calls ``next`` on every stored generator
    (through :func:`lena.math.md_map`) and returns the mapped result.
    """

    def __init__(self, generator, array):
        """Map *generator* onto the contents of *array*.

        *generator* is a callable applied to the array's contents;
        its results are later advanced with ``next``.
        Example when a bin is a sequence:
        ``generator=lambda cell: cell.compute()``.
        """
        self._generators = lena.math.md_map(generator, array)

    def __iter__(self):
        # The object is its own iterator.
        return self

    def next(self):
        # Python 2 iterator protocol: advance all generators by one step.
        advanced = lena.math.md_map(next, self._generators)
        return advanced

    def __next__(self):
        # Python 3 iterator protocol, delegates to the Python 2 method.
        return self.next()
def cell_to_string(cell_edges, var_context=None, coord_names=None,
                   coord_fmt="{}_lte_{}_lt_{}", coord_join="_", reverse=False):
    """Transform cell edges into a string.

    *cell_edges* is a tuple of pairs *(lower bound, upper bound)*
    for each coordinate.

    *var_context* is a variable context (a dictionary). It is used
    to get coordinate names when *coord_names* is not given:
    names are taken from its "combine" list if that is present,
    otherwise from its "name".

    *coord_names* is a list of coordinates names.
    If neither *coord_names* nor *var_context* is provided,
    the names are "coord0", "coord1", etc.

    *coord_fmt* is a string,
    which defines how to format individual coordinates.
    It is formatted with *(lower bound, name, upper bound)*.

    *coord_join* is a string, which joins coordinate pairs.

    If *reverse* is True, coordinates are joined in reverse order.

    If the number of coordinate names differs from the number
    of *cell_edges*, :exc:`~lena.core.LenaValueError` is raised.
    """
    if coord_names is None:
        if var_context is None:
            coord_names = ["coord{}".format(ind)
                           for ind in range(len(cell_edges))]
        elif "combine" in var_context:
            coord_names = [var["name"] for var in var_context["combine"]]
        else:
            coord_names = [var_context["name"]]

    if len(cell_edges) != len(coord_names):
        raise lena.core.LenaValueError(
            "coord_names must have same length as cell_edges, "
            "{} and {} given".format(coord_names, cell_edges)
        )

    coord_strings = [coord_fmt.format(edge[0], name, edge[1])
                     for (edge, name) in zip(cell_edges, coord_names)]
    if reverse:
        coord_strings.reverse()
    return coord_join.join(coord_strings)
def get_example_bin(struct):
    """Return the bin with zero index on each axis.

    For example, for a two-dimensional histogram *hist*
    this is ``hist[0][0]``.

    *struct* can be a :class:`~lena.structures.Histogram`
    or an array (nested lists) of bins.
    """
    if isinstance(struct, lena.structures.Histogram):
        zero_index = [0] * struct.dim
        return lena.structures.get_bin_on_index(zero_index, struct.bins)

    # A plain array of bins: descend along first elements
    # until something that is not a list is reached.
    cell = struct
    while isinstance(cell, list):
        cell = cell[0]
    return cell
class ReduceBinContent(object):
    """Transform bin content of histograms.

    This class is used when histogram bins contain complex structures.
    For example, in order to plot a histogram
    with a 3-dimensional vector in each bin,
    we shall create 3 histograms corresponding to vector's components.
    """

    def __init__(self, select, transform, drop_bins_context=True):
        """*Select* determines which types should be transformed.
        The types must be given in a ``list`` (not a tuple)
        or as a general :class:`Selector`.
        Example: ``select=[lena.math.vector3, list]``.

        *transform* is a *Sequence* or element applied to bin contents.
        If *transform* is not a :class:`~lena.core.Sequence`
        or an element with *run* method, it is converted to a
        :class:`~lena.core.Sequence`.
        Example: ``transform=Split([X(), Y(), Z()])``
        (provided that you have X, Y, Z variables).

        :class:`ReduceBinContent` creates histograms,
        which may be plotted, that is bins contain only data
        without context.
        By default, context of all bins except one is not used.
        If *drop_bins_context* is ``False``, a histogram of
        bin context is added to context.

        In case of wrong arguments,
        :exc:`~lena.core.LenaTypeError` is raised.
        """
        if not isinstance(select, lena.flow.Selector):
            try:
                select = lena.flow.Selector(select)
            except lena.core.LenaTypeError:
                raise lena.core.LenaTypeError(
                    "select must be convertible to a Selector, "
                    "{} given".format(select)
                )
        self._selector = select

        if not lena.core.is_run_el(transform):
            try:
                transform = lena.core.Sequence(transform)
            except lena.core.LenaTypeError:
                raise lena.core.LenaTypeError(
                    "transform must be a Sequence or convertible to that, "
                    "or an element with run method; "
                    "{} provided".format(transform)
                )
        self._transform = transform

        self._drop_bins_context = bool(drop_bins_context)

    def run(self, flow):
        """Transform histograms from *flow*.

        Not selected values pass unchanged.

        Context is updated with *variable*, *histogram*
        and *bin_content*.
        *variable* and *histogram* copy context from *split_into_bins*
        (if present there).
        *bin_content* includes context for example bin in "example_bin"
        and (optionally) for all bins in "all_bins".

        Every yielded histogram gets its own deep copy of the incoming
        context, so results do not share (or overwrite) each other's
        context and the context of the incoming value is not modified.
        """
        for value in flow:
            hist, context = lena.flow.get_data_context(value)
            # data part must be a histogram
            if not isinstance(hist, lena.structures.Histogram):
                yield value
                continue
            # value must be selected;
            # types are checked against the example (zeroth) bin
            if not self._selector(get_example_bin(hist)):
                # no transformation needed
                yield value
                continue

            # Bins should be transformed. Each bin is run through
            # its own deep copy of the transform.
            # Several iterations can happen, in principle.
            generators = _MdSeqMap(
                lambda cell: copy.deepcopy(self._transform).run([cell]),
                hist.bins
            )
            for new_bins in generators:
                new_data = lena.math.md_map(lena.flow.get_data, new_bins)

                example_context = copy.deepcopy(
                    lena.flow.get_context(get_example_bin(new_bins))
                )
                bin_content = {"example_bin": example_context}
                if not self._drop_bins_context:
                    bin_content["all_bins"] = lena.math.md_map(
                        lena.flow.get_context, new_bins
                    )

                # A separate context for each result: updating the shared
                # incoming dictionary in place would make all yielded
                # contexts the same object.
                new_context = copy.deepcopy(context)
                sib_context = new_context.get("split_into_bins", {})
                var_context = sib_context.get("variable", {})
                hist_context = sib_context.get("histogram", {})
                if var_context:
                    lena.context.update_nested(
                        new_context, {"variable": var_context}
                    )
                if hist_context:
                    lena.context.update_nested(
                        new_context, {"histogram": hist_context}
                    )
                lena.context.update_nested(
                    new_context, {"bin_content": bin_content}
                )

                # copy edges, so that histograms don't share them
                # (or make Histogram.edges immutable)
                edges = copy.deepcopy(hist.edges)
                new_hist = lena.structures.Histogram(edges, new_data)
                yield (new_hist, new_context)
class SplitIntoBins(lena.core.FillCompute):
    """Split analysis into bins."""

    def __init__(self, seq, arg_func, edges, transform=None):
        """*seq* is a :class:`~lena.core.FillComputeSeq` sequence,
        which corresponds to the analysis being compared
        for different bins.
        It can be a tuple containing a *FillCompute* element.
        Deep copy of *seq* will be used to produce each bin's content.

        *arg_func* is a function which takes data
        and returns argument value used to compute the bin index.
        A :class:`~lena.variables.variable.Variable` must be provided.
        Example of a two-dimensional function:
        ``arg_func = lena.variables.Variable("xy",
        lambda event: (event.x, event.y))``.

        *edges* is a sequence of arrays containing
        monotonically increasing bin edges along each dimension.
        Example: ``edges = lena.math.mesh((0, 1), 10)``.

        *transform* is a :class:`~lena.core.Sequence`,
        which is applied to results.
        The final histogram may contain vectors, histograms and
        any other data the analysis produced. To be able to plot them,
        *transform* can extract vector components or do other work
        to simplify structures.
        By default, *transform* is :class:`TransformBins`.
        Pass an empty tuple to disable it.

        **Attributes**: bins, edges.

        If *edges* are not increasing,
        :exc:`~lena.core.exceptions.LenaValueError` is raised.
        In case of other argument initialization problems,
        :exc:`~lena.core.exceptions.LenaTypeError` is raised.
        """
        if not isinstance(seq, lena.core.FillComputeSeq):
            try:
                seq = lena.core.FillComputeSeq(seq)
            except lena.core.LenaTypeError:
                raise lena.core.LenaTypeError(
                    "seq must contain a FillCompute element, "
                    "{} provided".format(seq)
                )

        if not isinstance(arg_func, lena.variables.Variable):
            raise lena.core.LenaTypeError(
                "arg_func must be a Variable, "
                "{} provided.".format(arg_func)
            )
        self._arg_var = arg_func
        self._arg_func = arg_func.getter

        # may raise LenaValueError
        lena.structures.check_edges_increasing(edges)
        # each bin contains its own deep copy of seq
        self.bins = lena.structures.init_bins(edges, seq, deepcopy=True)
        self.edges = edges

        if transform is None:
            transform = TransformBins()
        elif transform == ():
            # an empty tuple disables the transformation
            pass
        elif not isinstance(transform, lena.core.Sequence):
            try:
                transform = lena.core.Sequence(transform)
            except lena.core.LenaTypeError:
                raise lena.core.LenaTypeError(
                    "transform must be convertible to Sequence, "
                    "{} provided".format(transform)
                )
        self.transform = transform

        # context of the last value that was filled into a bin
        self._cur_context = {}

    def fill(self, val):
        """Fill the cell corresponding to *arg_func(val)* with *val*.

        Values outside of *edges* range are ignored.
        Context of the last filled value is stored
        and used in :meth:`compute`.
        """
        data, context = lena.flow.get_data_context(val)
        bin_index = lena.structures.get_bin_on_value(self._arg_func(data),
                                                     self.edges)
        subarr = self.bins
        for ind in bin_index:
            # underflow
            if ind < 0:
                return
            try:
                subarr = subarr[ind]
            # overflow
            except IndexError:
                return
        # subarr is now the cell with index bin_index
        subarr.fill(val)
        self._cur_context = context

    def compute(self):
        """Yield a *(Histogram, context)* for *compute()* for each bin.

        :class:`~lena.structures.Histogram`
        is created from :attr:`edges`
        and bins taken from compute() for :attr:`bins`.
        Context is preserved in histogram bins.

        :class:`SplitIntoBins` context is added
        to *context.split_into_bins* as *histogram*
        (corresponding to *edges*) and *variable*
        (corresponding to *arg_func*) subcontexts.
        If the stored context already contains *split_into_bins*
        (from an outer split), that is nested
        into the new *split_into_bins*.

        Each yielded histogram gets an independent copy of the context.

        In Python 3 the minimum number of *compute()*
        among all bins is used.
        In Python 2, if some bin is exhausted before the others,
        its content will be filled with None.
        """
        # The stored context is shared with some inner sequences,
        # therefore a snapshot is taken before bins are computed.
        base_context = copy.deepcopy(self._cur_context)
        generators = _MdSeqMap(lambda cell: cell.compute(), self.bins)

        # iteration stops when generators raise StopIteration
        for result in generators:
            # Start from the pristine snapshot for every result.
            # Reusing one dictionary would nest the split_into_bins
            # created on the previous iteration into the next one
            # and make all yielded contexts the same object.
            cur_context = copy.deepcopy(base_context)
            hist = lena.structures.Histogram(self.edges, result)

            old_sib = cur_context.pop("split_into_bins", {})
            sib_context = {}
            if old_sib:
                # nest previous (outer) split_into_bins
                sib_context["split_into_bins"] = old_sib
            cur_context["split_into_bins"] = sib_context

            # todo. improve consistency below
            var_context = copy.deepcopy(
                {"variable": self._arg_var.var_context}
            )
            hist_context = copy.deepcopy(hist._hist_context)
            sib_context.update(var_context)
            sib_context.update(hist_context)

            if self.transform:
                for transformed in self.transform.run([(hist, cur_context)]):
                    yield transformed
            else:
                yield (hist, cur_context)