# Source code for skchange.datasets._generate_regression

"""Data generators for regression data."""

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

from ..utils.validation.generation import check_random_generator, check_segment_lengths


def generate_piecewise_regression_data(
    lengths: int | list[int] | np.ndarray | None = None,
    *,
    n_segments: int = 3,
    n_samples: int = 100,
    n_features: int = 1,
    n_informative: int = 1,
    n_targets: int = 1,
    bias: float = 0.0,
    effective_rank: int | None = None,
    tail_strength: float = 0.5,
    noise: float = 1.0,
    shuffle: bool = True,
    seed: int | np.random.Generator | None = None,
    return_params: bool = False,
) -> (
    tuple[pd.DataFrame, list[str], list[str]]
    | tuple[pd.DataFrame, list[str], list[str], dict]
):
    """Generate piecewise linear regression data.

    Every segment is produced by its own call to
    `sklearn.datasets.make_regression`, so each segment gets its own
    regression coefficients. The segments are stacked row-wise in order.

    Parameters
    ----------
    lengths : int, list of int or np.ndarray, optional (default=None)
        The segment lengths. There are three possible cases:

        1. `list` or `numpy array`: Custom set of segment lengths.
        2. `int`: Length of `n_segments` equal segments.
        3. `None`: Generate `n_segments` random segment lengths with a total
           sample size of `n_samples`.
    n_segments : int (default=3)
        Number of segments to generate if `lengths` is an integer or None.
    n_samples : int (default=100)
        Total number of samples to generate if `lengths` is not specified.
    n_features : int (default=1)
        The total number of features.
    n_informative : int (default=1)
        The number of informative features.
    n_targets : int (default=1)
        The number of target variables.
    bias : float (default=0.0)
        The bias term in the linear model. Used across all segments.
    effective_rank : int | None (default=None)
        Forwarded unchanged to `make_regression` for every segment.
    tail_strength : float (default=0.5)
        Forwarded unchanged to `make_regression` for every segment. Per the
        scikit-learn documentation it is the relative importance of the fat
        noisy tail of the singular values profile and only matters when
        `effective_rank` is not None.
    noise : float (default=1.0)
        The standard deviation of the Gaussian noise applied to the output.
        Used across all segments.
    shuffle : bool (default=True)
        Whether to shuffle the samples and features per segment.
    seed : np.random.Generator | int | None, optional
        Seed for the random number generator or a numpy random generator
        instance. If specified, this ensures reproducible output across
        multiple calls.
    return_params : bool (default=False)
        Whether to return the parameters used for data generation.

    Returns
    -------
    pd.DataFrame
        The generated data as a DataFrame with columns named "feature_0",
        "feature_1", ..., "target_0", "target_1", ...
    list[str]
        A list of feature column names.
    list[str]
        A list of target column names.
    dict, optional
        Only returned if `return_params` is True. Contains the keys
        "n_segments", "n_samples", "lengths", "coefs" (one coefficient array
        per segment, in segment order) and "change_points" (the start index
        of every segment except the first).
    """
    rng = check_random_generator(seed)
    lengths = check_segment_lengths(lengths, n_segments, n_samples, seed=rng)
    n_segments = len(lengths)
    n_samples = np.sum(lengths)

    # make_regression requires a np.random.RandomState instance, so one is
    # seeded from the Generator. The same instance is shared by all segments,
    # which means its state advances from one segment to the next.
    legacy_rng = np.random.RandomState(rng.integers(0, 2**32 - 1))

    # Half-open row ranges [start, end) of the segments in the stacked output.
    ends = np.cumsum(lengths)
    starts = np.concatenate(([0], ends[:-1]))

    # Arguments that are identical for every segment.
    shared_kwargs = {
        "n_features": n_features,
        "n_informative": n_informative,
        "n_targets": n_targets,
        "bias": bias,
        "effective_rank": effective_rank,
        "tail_strength": tail_strength,
        "noise": noise,
        "shuffle": shuffle,
        "coef": True,
        "random_state": legacy_rng,
    }

    features = np.empty((n_samples, n_features), dtype=np.float64)
    targets = np.empty((n_samples, n_targets), dtype=np.float64)
    coefs = []
    for seg_start, seg_end in zip(starts, ends):
        seg_len = seg_end - seg_start
        seg_x, seg_y, seg_coef = make_regression(n_samples=seg_len, **shared_kwargs)
        features[seg_start:seg_end, :] = seg_x
        # Reshape so that y always fits the 2D (rows, n_targets) target slice.
        targets[seg_start:seg_end, :] = seg_y.reshape(seg_len, n_targets)
        coefs.append(seg_coef)

    feature_cols = [f"feature_{i}" for i in range(n_features)]
    target_cols = [f"target_{i}" for i in range(n_targets)]
    generated_df = pd.DataFrame(
        np.hstack((features, targets)),
        columns=[*feature_cols, *target_cols],
    )

    if not return_params:
        return generated_df, feature_cols, target_cols

    params = {
        "n_segments": n_segments,
        "n_samples": n_samples,
        "lengths": lengths,
        "coefs": coefs,
        "change_points": starts[1:],
    }
    return generated_df, feature_cols, target_cols, params