"""Data generators."""
__author__ = ["Tveten"]
import numpy as np
import pandas as pd
import scipy.stats
from ..utils.validation.generation import check_random_generator, check_segment_lengths
from ._utils import recycle_list
def _check_distributions(
distributions: (
scipy.stats.rv_continuous
| scipy.stats.rv_discrete
| list[scipy.stats.rv_continuous]
| list[scipy.stats.rv_discrete]
| None
),
n_segments: int,
) -> tuple[list[scipy.stats.rv_continuous | scipy.stats.rv_discrete], int, np.dtype]:
"""Check if distributions are valid and return as a list.
Parameters
----------
distributions : list of `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`
List of distributions for each segment.
n_segments : int
Number of segments to generate. Used to check if the number of distributions
matches the number of segments.
Returns
-------
list[scipy.stats.rv_continuous | scipy.stats.rv_discrete]
List of distributions for each segment, where each distribution is guarnteed
to have a `rvs(size: int, random_state: int | None)` method that returns
a numpy array or scalar of the same size, and the output size is either
1 or `p`.
int
Output size of the distributions, which is either 1 or `p`.
np.dtype
Data type of the output of the distributions.
"""
if distributions is None:
distributions = [scipy.stats.norm(), scipy.stats.norm(5)]
elif not isinstance(distributions, list):
distributions = [distributions]
if len(distributions) == 0:
raise ValueError("distributions cannot be an empty list.")
distributions = recycle_list(distributions, n_segments)
output_sizes = []
output_dtypes = []
for dist in distributions:
try:
output = dist.rvs(size=1)
output_sizes.append(output.size)
output_dtypes.append(output.dtype)
except Exception:
output_sizes.append(None)
if any(size is None for size in output_sizes):
raise ValueError(
"All distributions must support the 'rvs' method with a 'size' argument,"
" where the output is a numpy.array or numpy scalar."
" Ensure that all distributions are valid scipy.stats distributions."
)
if len(set(output_sizes)) > 1:
raise ValueError(
f"All distributions must produce samples with the same number of variables."
f" Got distribution.rvs(size=1).size outputs: {output_sizes}."
)
if len(set(output_dtypes)) > 1:
raise ValueError(
"All distributions must produce samples with the same data type."
f" Got distribution.rvs(size=1).dtype outputs: {output_dtypes}."
)
return distributions, output_sizes[0], output_dtypes[0]
[docs]
def generate_piecewise_data(
    distributions: scipy.stats.rv_continuous
    | scipy.stats.rv_discrete
    | list[scipy.stats.rv_continuous]
    | list[scipy.stats.rv_discrete]
    | None = None,
    lengths: int | list[int] | np.ndarray | None = None,
    *,
    n_segments: int = 3,
    n_samples: int = 100,
    seed: int | np.random.Generator | None = None,
    return_params: bool = False,
) -> pd.DataFrame | tuple[pd.DataFrame, dict]:
    """Generate data with a piecewise constant distribution.

    Consecutive segments of data are sampled from `scipy.stats` distributions,
    one distribution per segment. Settings that are not specified (such as the
    segment lengths) are randomly generated.

    Parameters
    ----------
    distributions : `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`, or list of these, optional (default=None)
        The distributions for generating piecewise data. A single distribution
        is treated as a one-element list. The distributions are recycled to
        match the number of segments specified by `lengths` or `n_segments`.
        If None, alternating segments of `scipy.stats.norm()` and
        `scipy.stats.norm(5)` are used. Each distribution is expected to be a
        scipy distribution instance (e.g., `scipy.stats.norm`,
        `scipy.stats.uniform`). See
        `scipy.stats <https://docs.scipy.org/doc/scipy/reference/stats.html>`_
        for a list of all available distributions.
    lengths : int, list of int or np.ndarray, optional (default=None)
        The segment lengths. There are three possible cases:

        1. `list` or `numpy array`: Custom set of segment lengths.
        2. `int`: Length of `n_segments` equal segments.
        3. `None`: Generate `n_segments` random segment lengths with a total
           sample size of `n_samples`.

    n_segments : int (default=3)
        Number of segments to generate if `lengths` is an integer or None.
    n_samples : int (default=100)
        Total number of samples to generate if `lengths` is not specified.
    seed : np.random.Generator | int | None, optional
        Seed for the random number generator or a numpy random generator
        instance. If specified, this ensures reproducible output across
        multiple calls.
    return_params : bool, optional (default=False)
        If True, the function returns a tuple of the generated DataFrame and a
        dictionary with the parameters used to generate the data.

    Returns
    -------
    pd.DataFrame
        Data frame with generated data, one row per sample and one column per
        variable.
    dict, optional
        A dictionary containing the parameters used to generate the data. Only
        returned if `return_params` is True. It has the following keys:

        * `"n_segments"` : number of segments generated.
        * `"n_samples"` : total number of samples generated.
        * `"lengths"` : the validated lengths of each segment.
        * `"distributions"` : list of `scipy.stats.rv_continuous` or
          `scipy.stats.rv_discrete` with the distributions used for each
          segment.
        * `"change_points"` : numpy array of change points, which are the
          starting indices of every segment except the first.

    Examples
    --------
    >>> # Example 1: Two normal segments
    >>> from skchange.datasets import generate_piecewise_data
    >>> from scipy.stats import norm
    >>> generate_piecewise_data(
    ...     distributions=[norm(0, 1), norm(10, 0.1)],
    ...     lengths=[7, 3],
    ...     seed=1,
    ... )
               0
    0   0.345584
    1   0.821618
    2   0.330437
    3  -1.303157
    4   0.905356
    5   0.446375
    6  -0.536953
    7  10.058112
    8  10.036457
    9  10.029413

    >>> # Example 2: Two Poisson segments
    >>> from scipy.stats import poisson
    >>> generate_piecewise_data(
    ...     distributions=[poisson(1), poisson(10)],
    ...     lengths=[5, 5],
    ...     seed=2,
    ... )
        0
    0   0
    1   0
    2   1
    3   2
    4   0
    5   8
    6  11
    7   9
    8   9
    9   9

    >>> # Example 3: Specify int lengths and n_segments
    >>> generate_piecewise_data(
    ...     distributions=[norm(0), norm(5)],
    ...     lengths=3,
    ...     n_segments=3,
    ...     seed=3,
    ... )
               0
    0   2.040919
    1  -2.555665
    2   0.418099
    3   4.432230
    4   4.547351
    5   4.784403
    6  -2.019986
    7  -0.231932
    8  -0.865213
    """  # noqa: E501
    rng = check_random_generator(seed)
    lengths = check_segment_lengths(lengths, n_segments, n_samples, seed=rng)

    # From here on the validated lengths are the single source of truth for the
    # segment count and the total sample size.
    n_segments = len(lengths)
    n_samples = np.sum(lengths)

    distributions, n_variables, dtype = _check_distributions(distributions, n_segments)

    # Half-open [start, end) row ranges of the segments.
    segment_ends = np.cumsum(lengths)
    segment_starts = np.concatenate(([0], segment_ends[:-1]))

    data = np.empty((n_samples, n_variables), dtype=dtype)
    for segment_dist, (lo, hi) in zip(distributions, zip(segment_starts, segment_ends)):
        segment_size = hi - lo
        draws = segment_dist.rvs(size=segment_size, random_state=rng)
        # Univariate draws come back 1-D; force one column per variable.
        data[lo:hi, :] = draws.reshape(segment_size, n_variables)

    frame = pd.DataFrame(data)
    if not return_params:
        return frame

    return frame, {
        "n_segments": n_segments,
        "n_samples": n_samples,
        "lengths": lengths,
        "distributions": distributions,
        # The first segment starts at 0 and is not a change point.
        "change_points": segment_starts[1:],
    }