Source code for skchange.datasets._generate

"""Data generators."""

__author__ = ["Tveten"]

import numpy as np
import pandas as pd
import scipy.stats

from ..utils.validation.generation import check_random_generator, check_segment_lengths
from ._utils import recycle_list


def _check_distributions(
    distributions: (
        scipy.stats.rv_continuous
        | scipy.stats.rv_discrete
        | list[scipy.stats.rv_continuous]
        | list[scipy.stats.rv_discrete]
        | None
    ),
    n_segments: int,
) -> tuple[list[scipy.stats.rv_continuous | scipy.stats.rv_discrete], int, np.dtype]:
    """Check if distributions are valid and return as a list.

    Parameters
    ----------
    distributions : list of `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`
        List of distributions for each segment.

    n_segments : int
        Number of segments to generate. Used to check if the number of distributions
        matches the number of segments.

    Returns
    -------
    list[scipy.stats.rv_continuous | scipy.stats.rv_discrete]
        List of distributions for each segment, where each distribution is guarnteed
        to have a `rvs(size: int, random_state: int | None)` method that returns
        a numpy array or scalar of the same size, and the output size is either
        1 or `p`.
    int
        Output size of the distributions, which is either 1 or `p`.
    np.dtype
        Data type of the output of the distributions.
    """
    if distributions is None:
        distributions = [scipy.stats.norm(), scipy.stats.norm(5)]
    elif not isinstance(distributions, list):
        distributions = [distributions]

    if len(distributions) == 0:
        raise ValueError("distributions cannot be an empty list.")

    distributions = recycle_list(distributions, n_segments)

    output_sizes = []
    output_dtypes = []
    for dist in distributions:
        try:
            output = dist.rvs(size=1)
            output_sizes.append(output.size)
            output_dtypes.append(output.dtype)
        except Exception:
            output_sizes.append(None)

    if any(size is None for size in output_sizes):
        raise ValueError(
            "All distributions must support the 'rvs' method with a 'size' argument,"
            " where the output is a numpy.array or numpy scalar."
            " Ensure that all distributions are valid scipy.stats distributions."
        )

    if len(set(output_sizes)) > 1:
        raise ValueError(
            f"All distributions must produce samples with the same number of variables."
            f" Got distribution.rvs(size=1).size outputs: {output_sizes}."
        )

    if len(set(output_dtypes)) > 1:
        raise ValueError(
            "All distributions must produce samples with the same data type."
            f" Got distribution.rvs(size=1).dtype outputs: {output_dtypes}."
        )

    return distributions, output_sizes[0], output_dtypes[0]



[docs]
def generate_piecewise_data(
    distributions: scipy.stats.rv_continuous
    | scipy.stats.rv_discrete
    | list[scipy.stats.rv_continuous]
    | list[scipy.stats.rv_discrete]
    | None = None,
    lengths: int | list[int] | np.ndarray | None = None,
    *,
    n_segments: int = 3,
    n_samples: int = 100,
    seed: int | np.random.Generator | None = None,
    return_params: bool = False,
) -> pd.DataFrame | tuple[pd.DataFrame, dict]:
    """Generate data with a piecewise constant distribution.

    Generate piecewise segments of data from `scipy.stats` distributions, where
    unspecified parameters are randomly generated.

    Parameters
    ----------
    distributions : list of `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`, optional (default=None)
        The distributions for generating piecewise data.
        They are recycled to match the number of segments specified by `lengths` or
        `n_segments`.
        If None, alternating segments of `scipy.stats.norm()` and `scipy.stats.norm(5)`
        are used. Each distribution is expected to be a scipy distribution instance
        (e.g., `scipy.stats.norm`, `scipy.stats.uniform`). See
        `scipy.stats <https://docs.scipy.org/doc/scipy/reference/stats.html>`_
        for a list of all available distributions.

    lengths : int, list of int or np.ndarray, optional (default=None)
        The segment lengths. There are three possible cases:

        1. `list` or `numpy array`: Custom set of segment lengths.
        2. `int`: Length of `n_segments` equal segments.
        3. `None`: Generate `n_segments` random segment lengths with a total sample size
           of `n_samples`.

    n_segments : int (default=3)
        Number of segments to generate if `lengths` is an integer or None.

    n_samples : int (default=100)
        Total number of samples to generate if `lengths` is not specified.

    seed : np.random.Generator | int | None, optional
        Seed for the random number generator or a numpy random generator instance.
        If specified, this ensures reproducible output across multiple calls.

    return_params : bool, optional (default=False)
        If True, the function returns a tuple of the generated DataFrame and a
        dictionary with the parameters used to generate the data.

    Returns
    -------
    pd.DataFrame
        Data frame with generated data.

    dict, optional
        A dictionary containing the parameters used to generate the data. Only returned
        if `return_params` is True. It has the following keys:

        * `"n_segments"` : number of segments generated.
        * `"n_samples"` : total number of samples generated.
        * `"distributions"` : list of `scipy.stats.rv_continuous` or
          `scipy.stats.rv_discrete` with the distributions used for each segment.
        * `"lengths"` : list of lengths for each segment.
        * `"change_points"` : list of change points, which are the starting indices
          of each segment in the data.

    Examples
    --------
    >>> # Example 1: Two normal segments
    >>> from skchange.datasets import generate_piecewise_data
    >>> from scipy.stats import norm
    >>> generate_piecewise_data(
    ...     distributions=[norm(0, 1), norm(10, 0.1)],
    ...     lengths=[7, 3],
    ...     seed=1,
    ... )
               0
    0   0.345584
    1   0.821618
    2   0.330437
    3  -1.303157
    4   0.905356
    5   0.446375
    6  -0.536953
    7  10.058112
    8  10.036457
    9  10.029413

    >>> # Example 2: Two Poisson segments
    >>> from scipy.stats import poisson
    >>> generate_piecewise_data(
    ...     distributions=[poisson(1), poisson(10)],
    ...     lengths=[5, 5],
    ...     seed=2,
    ... )
        0
    0   0
    1   0
    2   1
    3   2
    4   0
    5   8
    6  11
    7   9
    8   9
    9   9


    >>> # Example 3: Specify int lengths and n_segments
    >>> generate_piecewise_data(
    ...     distributions=[norm(0), norm(5)],
    ...     lengths=3,
    ...     n_segments=3,
    ...     seed=3,
    ... )
              0
    0  2.040919
    1 -2.555665
    2  0.418099
    3  4.432230
    4  4.547351
    5  4.784403
    6 -2.019986
    7 -0.231932
    8 -0.865213
    """  # noqa: E501
    random_generator = check_random_generator(seed)
    lengths = check_segment_lengths(
        lengths, n_segments, n_samples, seed=random_generator
    )
    n_segments = len(lengths)
    n_samples = np.sum(lengths)
    distributions, n_variables, dtype = _check_distributions(distributions, n_segments)

    ends = np.cumsum(lengths)
    starts = np.concatenate(([0], ends[:-1]))
    generated_values = np.empty((n_samples, n_variables), dtype=dtype)
    for distribution, start, end in zip(distributions, starts, ends):
        length = end - start
        values = distribution.rvs(size=length, random_state=random_generator)
        generated_values[start:end, :] = values.reshape(length, n_variables)

    generated_df = pd.DataFrame(generated_values)

    if return_params:
        params = {
            "n_segments": n_segments,
            "n_samples": n_samples,
            "lengths": lengths,
            "distributions": distributions,
            "change_points": starts[1:],
        }
        return generated_df, params

    return generated_df