diff --git a/.gitignore b/.gitignore index c5f32eeaa1..aa9833b22d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ evidently/examples/.DS_Store dist build MANIFEST + +__pycache__ \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000..d10e8fefd2 --- /dev/null +++ b/config.json @@ -0,0 +1,22 @@ +{ + "data_format": { + "separator": ",", + "header": true, + "date_column": "dteday" + }, + "column_mapping" : {}, + "profile_sections": ["data_drift"], + "pretty_print": true, + "sampling": { + "reference": { + "type": "none", + "n": 1, + "ratio": 0.1 + }, + "current": { + "type": "nth", + "n": 2, + "ratio": 0.1 + } + } +} \ No newline at end of file diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000..f79d65ee38 --- /dev/null +++ b/config.yaml @@ -0,0 +1,18 @@ +data_format: + separator: "," + header: true + date_column: "dteday" +column_mapping: {} +profile_sections: + - "data_drift" +pretty_print: true +sampling: + reference: + type: "simple" # could be "none", "simple", "random" + n: 5 # used with simple sampling, number of rows to skip + ratio: 0.1 # used with random sampling, part of data to take from chunk + random_seed: 4 # used with random sampling, used as seed for random generator + current: + type: "nth" # could be "none", "simple", "random" + n: 5 # used with simple sampling, number of rows to skip + ratio: 0.1 # used with random sampling, part of data to take from chunk \ No newline at end of file diff --git a/evidently/__main__.py b/evidently/__main__.py index fee7d9b33e..d09d4758d7 100644 --- a/evidently/__main__.py +++ b/evidently/__main__.py @@ -1,12 +1,16 @@ import argparse import json +import logging import os import sys from typing import Dict, List from dataclasses import dataclass +import yaml + from evidently.runner.dashboard_runner import DashboardRunnerOptions, DashboardRunner +from evidently.runner.loader import SamplingOptions from evidently.runner.profile_runner 
import ProfileRunner, ProfileRunnerOptions from evidently.runner.runner import DataOptions @@ -18,10 +22,17 @@ class DataFormatOptions: date_column: str +@dataclass +class Sampling: + reference: SamplingOptions + current: SamplingOptions + + @dataclass class CalculateOptions: data_format: DataFormatOptions column_mapping: Dict[str, str] + sampling: Sampling @dataclass @@ -35,22 +46,42 @@ class ProfileOptions(CalculateOptions): pretty_print: bool = False +def __get_not_none(d, key, default): + return default if d.get(key, None) is None else d.get(key) + + def calculate_dashboard(config: str, reference: str, current: str, output_path: str, report_name: str, **_kv): with open(config) as f_config: - opts_data = json.load(f_config) + if config.endswith(".yaml") or config.endswith(".yml"): + opts_data = yaml.load(f_config, Loader=yaml.SafeLoader) + elif config.endswith(".json"): + opts_data = json.load(f_config) + else: + raise Exception(f"config .{config.split('.')[-1]} not supported") + + sampling = __get_not_none(opts_data, "sampling", {}) + ref_sampling = __get_not_none(sampling, "reference", {}) + cur_sampling = __get_not_none(sampling, "current", {}) + opts = DashboardOptions(data_format=DataFormatOptions(**opts_data["data_format"]), column_mapping=opts_data["column_mapping"], - dashboard_tabs=opts_data["dashboard_tabs"]) + dashboard_tabs=opts_data["dashboard_tabs"], + sampling=Sampling( + reference=SamplingOptions(**ref_sampling), + current=SamplingOptions(**cur_sampling), + )) runner = DashboardRunner(DashboardRunnerOptions( reference_data_path=reference, reference_data_options=DataOptions(date_column=opts.data_format.date_column, separator=opts.data_format.separator, header=opts.data_format.header), - production_data_path=current, - production_data_options=DataOptions(date_column=opts.data_format.date_column, - separator=opts.data_format.separator, - header=opts.data_format.header), + reference_data_sampling=opts.sampling.reference, + current_data_path=current, 
+ current_data_options=DataOptions(date_column=opts.data_format.date_column, + separator=opts.data_format.separator, + header=opts.data_format.header), + current_data_sampling=opts.sampling.current, dashboard_tabs=opts.dashboard_tabs, column_mapping=opts.column_mapping, output_path=os.path.join(output_path, report_name), @@ -60,21 +91,37 @@ def calculate_dashboard(config: str, reference: str, current: str, output_path: def calculate_profile(config: str, reference: str, current: str, output_path: str, report_name: str, **_kv): with open(config) as f_config: - opts_data = json.load(f_config) + if config.endswith(".yaml") or config.endswith(".yml"): + opts_data = yaml.load(f_config, Loader=yaml.SafeLoader) + elif config.endswith(".json"): + opts_data = json.load(f_config) + else: + raise Exception(f"config .{config.split('.')[-1]} not supported") + + sampling = __get_not_none(opts_data, "sampling", {}) + ref_sampling = __get_not_none(sampling, "reference", {}) + cur_sampling = __get_not_none(sampling, "current", {}) + opts = ProfileOptions(data_format=DataFormatOptions(**opts_data["data_format"]), column_mapping=opts_data["column_mapping"], profile_parts=opts_data["profile_sections"], - pretty_print=opts_data["pretty_print"]) + pretty_print=opts_data["pretty_print"], + sampling=Sampling( + reference=SamplingOptions(**ref_sampling), + current=SamplingOptions(**cur_sampling), + )) runner = ProfileRunner(ProfileRunnerOptions( reference_data_path=reference, reference_data_options=DataOptions(date_column=opts.data_format.date_column, separator=opts.data_format.separator, header=opts.data_format.header), - production_data_path=current, - production_data_options=DataOptions(date_column=opts.data_format.date_column, - separator=opts.data_format.separator, - header=opts.data_format.header), + reference_data_sampling=opts.sampling.reference, + current_data_path=current, + current_data_options=DataOptions(date_column=opts.data_format.date_column, + 
separator=opts.data_format.separator, + header=opts.data_format.header), + current_data_sampling=opts.sampling.current, profile_parts=opts.profile_parts, column_mapping=opts.column_mapping, output_path=os.path.join(output_path, report_name), @@ -92,10 +139,13 @@ def _add_default_parameters(configurable_parser, default_output_name: str): configurable_parser.add_argument("--reference", dest="reference", required=True, help="Path to reference data") configurable_parser.add_argument("--current", dest="current", help="Path to current data") configurable_parser.add_argument("--output_path", dest="output_path", required=True, help="Path to store report") - configurable_parser.add_argument("--report_name", dest="report_name", default=default_output_name, help="Report name") + configurable_parser.add_argument("--report_name", dest="report_name", default=default_output_name, + help="Report name") configurable_parser.add_argument("--config", dest="config", required=True, help="Path to configuration") +logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser() parsers = parser.add_subparsers() diff --git a/evidently/analyzers/cat_target_drift_analyzer.py b/evidently/analyzers/cat_target_drift_analyzer.py index d301b51a55..165dbeaa07 100644 --- a/evidently/analyzers/cat_target_drift_analyzer.py +++ b/evidently/analyzers/cat_target_drift_analyzer.py @@ -10,7 +10,7 @@ class CatTargetDriftAnalyzer(Analyzer): - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping): result = dict() if column_mapping: date_column = column_mapping.get('datetime') @@ -49,25 +49,25 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - 
production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) ref_feature_vc = reference_data[target_column].value_counts() - prod_feature_vc = production_data[target_column].value_counts() + current_feature_vc = current_data[target_column].value_counts() keys = set(list(reference_data[target_column].unique()) + - list(production_data[target_column].unique())) + list(current_data[target_column].unique())) ref_feature_dict = dict.fromkeys(keys, 0) for key, item in zip(ref_feature_vc.index, ref_feature_vc.values): ref_feature_dict[key] = item - prod_feature_dict = dict.fromkeys(keys, 0) - for key, item in zip(prod_feature_vc.index, prod_feature_vc.values): - prod_feature_dict[key] = item + current_feature_dict = dict.fromkeys(keys, 0) + for key, item in zip(current_feature_vc.index, current_feature_vc.values): + current_feature_dict[key] = item f_exp = [value[1] for value in sorted(ref_feature_dict.items())] - f_obs = [value[1] for value in sorted(prod_feature_dict.items())] + f_obs = [value[1] for value in sorted(current_feature_dict.items())] target_p_value = chisquare(f_exp, f_obs)[1] result['metrics']["target_name"] = target_column @@ -80,25 +80,25 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) ref_feature_vc = reference_data[prediction_column].value_counts() - prod_feature_vc = production_data[prediction_column].value_counts() + current_feature_vc = current_data[prediction_column].value_counts() keys = set(list(reference_data[prediction_column].unique()) + - 
list(production_data[prediction_column].unique())) + list(current_data[prediction_column].unique())) ref_feature_dict = dict.fromkeys(keys, 0) for key, item in zip(ref_feature_vc.index, ref_feature_vc.values): ref_feature_dict[key] = item - prod_feature_dict = dict.fromkeys(keys, 0) - for key, item in zip(prod_feature_vc.index, prod_feature_vc.values): - prod_feature_dict[key] = item + current_feature_dict = dict.fromkeys(keys, 0) + for key, item in zip(current_feature_vc.index, current_feature_vc.values): + current_feature_dict[key] = item f_exp = [value[1] for value in sorted(ref_feature_dict.items())] - f_obs = [value[1] for value in sorted(prod_feature_dict.items())] + f_obs = [value[1] for value in sorted(current_feature_dict.items())] pred_p_value = chisquare(f_exp, f_obs)[1] result['metrics']["prediction_name"] = prediction_column diff --git a/evidently/analyzers/classification_performance_analyzer.py b/evidently/analyzers/classification_performance_analyzer.py index f06ce395d3..2d721d55f1 100644 --- a/evidently/analyzers/classification_performance_analyzer.py +++ b/evidently/analyzers/classification_performance_analyzer.py @@ -10,7 +10,7 @@ from sklearn import metrics class ClassificationPerformanceAnalyzer(Analyzer): - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping): result = dict() if column_mapping: date_column = column_mapping.get('datetime') @@ -83,18 +83,18 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, result['metrics']['reference']['confusion_matrix']['labels'] = labels result['metrics']['reference']['confusion_matrix']['values'] = conf_matrix.tolist() - if production_data is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None: + current_data.replace([np.inf, 
-np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) result['metrics']['current'] = {} - accuracy_score = metrics.accuracy_score(production_data[target_column], production_data[prediction_column]) - avg_precision = metrics.precision_score(production_data[target_column], production_data[prediction_column], + accuracy_score = metrics.accuracy_score(current_data[target_column], current_data[prediction_column]) + avg_precision = metrics.precision_score(current_data[target_column], current_data[prediction_column], average='macro') - avg_recall = metrics.recall_score(production_data[target_column], production_data[prediction_column], + avg_recall = metrics.recall_score(current_data[target_column], current_data[prediction_column], average='macro') - avg_f1 = metrics.f1_score(production_data[target_column], production_data[prediction_column], + avg_f1 = metrics.f1_score(current_data[target_column], current_data[prediction_column], average='macro') result['metrics']['current']['accuracy'] = accuracy_score @@ -103,15 +103,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, result['metrics']['current']['f1'] = avg_f1 #calculate class support and metrics matrix - metrics_matrix = metrics.classification_report(production_data[target_column], production_data[prediction_column], + metrics_matrix = metrics.classification_report(current_data[target_column], current_data[prediction_column], output_dict=True) result['metrics']['current']['metrics_matrix'] = metrics_matrix #calculate confusion matrix - conf_matrix = metrics.confusion_matrix(production_data[target_column], - production_data[prediction_column]) - labels = target_names if target_names else sorted(set(production_data[target_column])) + conf_matrix = metrics.confusion_matrix(current_data[target_column], + current_data[prediction_column]) + labels = target_names if target_names else sorted(set(current_data[target_column])) 
result['metrics']['current']['confusion_matrix'] = {} result['metrics']['current']['confusion_matrix']['labels'] = labels diff --git a/evidently/analyzers/data_drift_analyzer.py b/evidently/analyzers/data_drift_analyzer.py index 4da35782e3..9dfb15e581 100644 --- a/evidently/analyzers/data_drift_analyzer.py +++ b/evidently/analyzers/data_drift_analyzer.py @@ -10,7 +10,7 @@ class DataDriftAnalyzer(Analyzer): - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping): result = dict() if column_mapping: date_column = column_mapping.get('datetime') @@ -47,37 +47,37 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, result['metrics'] = {} for feature_name in num_feature_names: result['metrics'][feature_name] = dict( - prod_small_hist=[t.tolist() for t in np.histogram(production_data[feature_name][np.isfinite(production_data[feature_name])], + current_small_hist=[t.tolist() for t in np.histogram(current_data[feature_name][np.isfinite(current_data[feature_name])], bins=10, density=True)], ref_small_hist=[t.tolist() for t in np.histogram(reference_data[feature_name][np.isfinite(reference_data[feature_name])], bins=10, density=True)], feature_type='num', - p_value=ks_2samp(reference_data[feature_name], production_data[feature_name])[1] + p_value=ks_2samp(reference_data[feature_name], current_data[feature_name])[1] ) for feature_name in cat_feature_names: ref_feature_vc = reference_data[feature_name][np.isfinite(reference_data[feature_name])].value_counts() - prod_feature_vc = production_data[feature_name][np.isfinite(production_data[feature_name])].value_counts() + current_feature_vc = current_data[feature_name][np.isfinite(current_data[feature_name])].value_counts() keys = set(list(reference_data[feature_name][np.isfinite(reference_data[feature_name])].unique()) + - 
list(production_data[feature_name][np.isfinite(production_data[feature_name])].unique())) + list(current_data[feature_name][np.isfinite(current_data[feature_name])].unique())) ref_feature_dict = dict.fromkeys(keys, 0) for key, item in zip(ref_feature_vc.index, ref_feature_vc.values): ref_feature_dict[key] = item - prod_feature_dict = dict.fromkeys(keys, 0) - for key, item in zip(prod_feature_vc.index, prod_feature_vc.values): - prod_feature_dict[key] = item + current_feature_dict = dict.fromkeys(keys, 0) + for key, item in zip(current_feature_vc.index, current_feature_vc.values): + current_feature_dict[key] = item f_exp = [value[1] for value in sorted(ref_feature_dict.items())] - f_obs = [value[1] for value in sorted(prod_feature_dict.items())] + f_obs = [value[1] for value in sorted(current_feature_dict.items())] # CHI2 to be implemented for cases with different categories p_value = chisquare(f_exp, f_obs)[1] result['metrics'][feature_name] = dict( - prod_small_hist=[t.tolist() for t in np.histogram(production_data[feature_name][np.isfinite(production_data[feature_name])], + current_small_hist=[t.tolist() for t in np.histogram(current_data[feature_name][np.isfinite(current_data[feature_name])], bins=10, density=True)], ref_small_hist=[t.tolist() for t in np.histogram(reference_data[feature_name][np.isfinite(reference_data[feature_name])], bins=10, density=True)], diff --git a/evidently/analyzers/num_target_drift_analyzer.py b/evidently/analyzers/num_target_drift_analyzer.py index 47a5159724..e133b324d7 100644 --- a/evidently/analyzers/num_target_drift_analyzer.py +++ b/evidently/analyzers/num_target_drift_analyzer.py @@ -10,7 +10,7 @@ class NumTargetDriftAnalyzer(Analyzer): - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping): result = dict() if column_mapping: date_column = column_mapping.get('datetime') @@ -47,28 +47,28 
@@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #target if target_column is not None: #drift - target_p_value = ks_2samp(reference_data[target_column], production_data[target_column])[1] + target_p_value = ks_2samp(reference_data[target_column], current_data[target_column])[1] result['metrics']["target_name"] = target_column result['metrics']["target_type"] = 'num' result['metrics']["target_drift"] = target_p_value #corr ref_target_corr = reference_data[num_feature_names + [target_column]].corr()[target_column] - curr_target_corr = production_data[num_feature_names + [target_column]].corr()[target_column] + curr_target_corr = current_data[num_feature_names + [target_column]].corr()[target_column] target_corr = {'reference':ref_target_corr.to_dict(), 'current':curr_target_corr.to_dict()} result['metrics']['target_correlations'] = target_corr #prediction if prediction_column is not None: #drift - pred_p_value = ks_2samp(reference_data[prediction_column], production_data[prediction_column])[1] + pred_p_value = ks_2samp(reference_data[prediction_column], current_data[prediction_column])[1] result['metrics']["prediction_name"] = prediction_column result['metrics']["prediction_type"] = 'num' result['metrics']["prediction_drift"] = pred_p_value #corr ref_pred_corr = reference_data[num_feature_names + [prediction_column]].corr()[prediction_column] - curr_pred_corr = production_data[num_feature_names + [prediction_column]].corr()[prediction_column] + curr_pred_corr = current_data[num_feature_names + [prediction_column]].corr()[prediction_column] prediction_corr = {'reference':ref_pred_corr.to_dict(), 'current':curr_pred_corr.to_dict()} result['metrics']['prediction_correlations'] = prediction_corr diff --git a/evidently/analyzers/prob_classification_performance_analyzer.py b/evidently/analyzers/prob_classification_performance_analyzer.py index 473a92ef0c..5ffad9e9a2 100644 --- 
a/evidently/analyzers/prob_classification_performance_analyzer.py +++ b/evidently/analyzers/prob_classification_performance_analyzer.py @@ -10,7 +10,7 @@ from sklearn import metrics, preprocessing class ProbClassificationPerformanceAnalyzer(Analyzer): - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping): result = dict() if column_mapping: date_column = column_mapping.get('datetime') @@ -171,15 +171,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, pr_table.append([top, int(count), prob, int(tp), int(fp), precision, recall]) result['metrics']['reference']['pr_curve'][label] = pr_table - if production_data is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) - binaraized_target = binaraizer.transform(production_data[target_column]) + binaraized_target = binaraizer.transform(current_data[target_column]) - array_prediction = production_data[prediction_column].to_numpy() + array_prediction = current_data[prediction_column].to_numpy() prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [prediction_column[x] for x in prediction_ids] @@ -191,15 +191,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, roc_auc = metrics.roc_auc_score(binaraized_target, array_prediction, average='macro') log_loss = metrics.log_loss(binaraized_target, array_prediction) else: - roc_auc = metrics.roc_auc_score(binaraized_target, production_data[prediction_column[0]]) #problem!!! 
- log_loss = metrics.log_loss(binaraized_target, production_data[prediction_column[0]]) #problem!!! + roc_auc = metrics.roc_auc_score(binaraized_target, current_data[prediction_column[0]]) #problem!!! + log_loss = metrics.log_loss(binaraized_target, current_data[prediction_column[0]]) #problem!!! - accuracy_score = metrics.accuracy_score(production_data[target_column], prediction_labels) - avg_precision = metrics.precision_score(production_data[target_column], prediction_labels, + accuracy_score = metrics.accuracy_score(current_data[target_column], prediction_labels) + avg_precision = metrics.precision_score(current_data[target_column], prediction_labels, average='macro') - avg_recall = metrics.recall_score(production_data[target_column], prediction_labels, + avg_recall = metrics.recall_score(current_data[target_column], prediction_labels, average='macro') - avg_f1 = metrics.f1_score(production_data[target_column], prediction_labels, + avg_f1 = metrics.f1_score(current_data[target_column], prediction_labels, average='macro') result['metrics']['current']['accuracy'] = accuracy_score @@ -210,7 +210,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, result['metrics']['current']['log_loss'] = log_loss #calculate class support and metrics matrix - metrics_matrix = metrics.classification_report(production_data[target_column], prediction_labels, + metrics_matrix = metrics.classification_report(current_data[target_column], prediction_labels, output_dict=True) result['metrics']['current']['metrics_matrix'] = metrics_matrix @@ -219,7 +219,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, result['metrics']['current']['roc_aucs'] = roc_aucs.tolist() #calculate confusion matrix - conf_matrix = metrics.confusion_matrix(production_data[target_column], + conf_matrix = metrics.confusion_matrix(current_data[target_column], prediction_labels) result['metrics']['current']['confusion_matrix'] = {} @@ -229,20 +229,20 
@@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #calulate ROC and PR curves, PR table if len(prediction_column) <= 2: binaraizer = preprocessing.LabelBinarizer() - binaraizer.fit(production_data[target_column]) - binaraized_target = pd.DataFrame(binaraizer.transform(production_data[target_column])) + binaraizer.fit(current_data[target_column]) + binaraized_target = pd.DataFrame(binaraizer.transform(current_data[target_column])) binaraized_target.columns = ['target'] - fpr, tpr, thrs = metrics.roc_curve(binaraized_target, production_data[prediction_column[0]]) + fpr, tpr, thrs = metrics.roc_curve(binaraized_target, current_data[prediction_column[0]]) result['metrics']['current']['roc_curve'] = {'fpr':fpr.tolist(), 'tpr':tpr.tolist(), 'thrs':thrs.tolist()} - pr, rcl, thrs = metrics.precision_recall_curve(binaraized_target, production_data[prediction_column[0]]) + pr, rcl, thrs = metrics.precision_recall_curve(binaraized_target, current_data[prediction_column[0]]) result['metrics']['current']['pr_curve'] = {'pr':pr.tolist(), 'rcl':rcl.tolist(), 'thrs':thrs.tolist()} pr_table = [] step_size = 0.05 binded = list(zip(binaraized_target['target'].tolist(), - production_data[prediction_column[0]].tolist())) + current_data[prediction_column[0]].tolist())) binded.sort(key = lambda item: item[1], reverse = True) data_size = len(binded) target_class_size = sum([x[0] for x in binded]) @@ -260,23 +260,23 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, else: binaraizer = preprocessing.LabelBinarizer() - binaraizer.fit(production_data[target_column]) - binaraized_target = pd.DataFrame(binaraizer.transform(production_data[target_column])) + binaraizer.fit(current_data[target_column]) + binaraized_target = pd.DataFrame(binaraizer.transform(current_data[target_column])) binaraized_target.columns = prediction_column result['metrics']['current']['roc_curve'] = {} result['metrics']['current']['pr_curve'] = {} for 
label in prediction_column: - fpr, tpr, thrs = metrics.roc_curve(binaraized_target[label], production_data[label]) + fpr, tpr, thrs = metrics.roc_curve(binaraized_target[label], current_data[label]) result['metrics']['current']['roc_curve'][label] = {'fpr':fpr.tolist(), 'tpr':tpr.tolist(), 'thrs':thrs.tolist()} - pr, rcl, thrs = metrics.precision_recall_curve(binaraized_target[label], production_data[label]) + pr, rcl, thrs = metrics.precision_recall_curve(binaraized_target[label], current_data[label]) result['metrics']['current']['pr_curve'][label] = {'pr':pr.tolist(), 'rcl':rcl.tolist(), 'thrs':thrs.tolist()} pr_table = [] step_size = 0.05 binded = list(zip(binaraized_target[label].tolist(), - production_data[label].tolist())) + current_data[label].tolist())) binded.sort(key = lambda item: item[1], reverse = True) data_size = len(binded) target_class_size = sum([x[0] for x in binded]) diff --git a/evidently/analyzers/regression_performance_analyzer.py b/evidently/analyzers/regression_performance_analyzer.py index 4da39e5235..1e3df96260 100644 --- a/evidently/analyzers/regression_performance_analyzer.py +++ b/evidently/analyzers/regression_performance_analyzer.py @@ -10,7 +10,7 @@ from sklearn import metrics class RegressionPerformanceAnalyzer(Analyzer): - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping): result = dict() if column_mapping: date_column = column_mapping.get('datetime') @@ -97,19 +97,19 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, result['metrics']['reference']['underperformance']['underestimation'] = {'mean_error':float(mae_under), 'std_error':float(sd_under)} result['metrics']['reference']['underperformance']['overestimation'] = {'mean_error':float(mae_over), 'std_error':float(sd_over)} - if production_data is not None: - production_data.replace([np.inf, -np.inf], 
np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #calculate quality metrics - me = np.mean(production_data[prediction_column] - production_data[target_column]) - sde = np.std(production_data[prediction_column] - production_data[target_column], ddof = 1) + me = np.mean(current_data[prediction_column] - current_data[target_column]) + sde = np.std(current_data[prediction_column] - current_data[target_column], ddof = 1) - abs_err = np.abs(production_data[prediction_column] - production_data[target_column]) + abs_err = np.abs(current_data[prediction_column] - current_data[target_column]) mae = np.mean(abs_err) sdae = np.std(abs_err, ddof = 1) - abs_perc_err = 100.*np.abs(production_data[prediction_column] - production_data[target_column])/production_data[target_column] + abs_perc_err = 100.*np.abs(current_data[prediction_column] - current_data[target_column])/current_data[target_column] mape = np.mean(abs_perc_err) sdape = np.std(abs_perc_err, ddof = 1) @@ -117,8 +117,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, 'error_std':float(sde), 'abs_error_std':float(sdae), 'abs_perc_error_std':float(sdape)} #error normality - prod_error = production_data[prediction_column] - production_data[target_column] - qq_lines = probplot(prod_error, dist="norm", plot=None) + current_error = current_data[prediction_column] - current_data[target_column] + qq_lines = probplot(current_error, dist="norm", plot=None) theoretical_q_x = np.linspace(qq_lines[0][0][0], qq_lines[0][0][-1], 100) qq_dots = [t.tolist() for t in qq_lines[0]] @@ -128,23 +128,23 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, 'order_statistic_medians':[float(x) for x in qq_dots[1]], 'slope':float(qq_line[0]), 'intercept':float(qq_line[1]), 'r':float(qq_line[2])} 
#underperformance metrics - prod_quantile_5 = np.quantile(prod_error, .05) - prod_quantile_95 = np.quantile(prod_error, .95) + current_quantile_5 = np.quantile(current_error, .05) + current_quantile_95 = np.quantile(current_error, .95) - prod_mae = np.mean(prod_error) - prod_mae_under = np.mean(prod_error[prod_error <= prod_quantile_5]) - prod_mae_exp = np.mean(prod_error[(prod_error > prod_quantile_5) & (prod_error < prod_quantile_95)]) - prod_mae_over = np.mean(prod_error[prod_error >= prod_quantile_95]) + current_mae = np.mean(current_error) + current_mae_under = np.mean(current_error[current_error <= current_quantile_5]) + current_mae_exp = np.mean(current_error[(current_error > current_quantile_5) & (current_error < current_quantile_95)]) + current_mae_over = np.mean(current_error[current_error >= current_quantile_95]) - prod_sd = np.std(prod_error, ddof = 1) - prod_sd_under = np.std(prod_error[prod_error <= prod_quantile_5], ddof = 1) - prod_sd_exp = np.std(prod_error[(prod_error > prod_quantile_5) & (prod_error < prod_quantile_95)], ddof = 1) - prod_sd_over = np.std(prod_error[prod_error >= prod_quantile_95], ddof = 1) + current_sd = np.std(current_error, ddof = 1) + current_sd_under = np.std(current_error[current_error <= current_quantile_5], ddof = 1) + current_sd_exp = np.std(current_error[(current_error > current_quantile_5) & (current_error < current_quantile_95)], ddof = 1) + current_sd_over = np.std(current_error[current_error >= current_quantile_95], ddof = 1) result['metrics']['current']['underperformance'] = {} - result['metrics']['current']['underperformance']['majority'] = {'mean_error':float(prod_mae_exp), 'std_error':float(prod_sd_exp)} - result['metrics']['current']['underperformance']['underestimation'] = {'mean_error':float(prod_mae_under), 'std_error':float(prod_sd_under)} - result['metrics']['current']['underperformance']['overestimation'] = {'mean_error':float(prod_mae_over), 'std_error':float(prod_sd_over)} +
result['metrics']['current']['underperformance']['majority'] = {'mean_error':float(current_mae_exp), 'std_error':float(current_sd_exp)} + result['metrics']['current']['underperformance']['underestimation'] = {'mean_error':float(current_mae_under), 'std_error':float(current_sd_under)} + result['metrics']['current']['underperformance']['overestimation'] = {'mean_error':float(current_mae_over), 'std_error':float(current_sd_over)} #error bias table error_bias = {} @@ -157,15 +157,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, ref_over_value = np.mean(reference_data[error >= quantile_95][feature_name]) ref_range_value = 0 if ref_over_value == ref_under_value else 100*abs(ref_over_value - ref_under_value)/(np.max(reference_data[feature_name]) - np.min(reference_data[feature_name])) - prod_overal_value = np.mean(production_data[feature_name]) - prod_under_value = np.mean(production_data[prod_error <= prod_quantile_5][feature_name]) - prod_expected_value = np.mean(production_data[(prod_error > prod_quantile_5) & (prod_error < prod_quantile_95)][feature_name]) - prod_over_value = np.mean(production_data[prod_error >= prod_quantile_95][feature_name]) - prod_range_value = 0 if prod_over_value == prod_under_value else 100*abs(prod_over_value - prod_under_value)/(np.max(production_data[feature_name]) - np.min(production_data[feature_name])) + current_overal_value = np.mean(current_data[feature_name]) + current_under_value = np.mean(current_data[current_error <= current_quantile_5][feature_name]) + current_expected_value = np.mean(current_data[(current_error > current_quantile_5) & (current_error < current_quantile_95)][feature_name]) + current_over_value = np.mean(current_data[current_error >= current_quantile_95][feature_name]) + current_range_value = 0 if current_over_value == current_under_value else 100*abs(current_over_value - current_under_value)/(np.max(current_data[feature_name]) - np.min(current_data[feature_name])) 
error_bias[feature_name] = {'feature_type':feature_type, 'ref_majority':float(ref_expected_value), 'ref_under':float(ref_under_value), - 'ref_over':float(ref_over_value), 'ref_range':float(ref_range_value),'prod_majority':float(prod_expected_value), 'prod_under':float(prod_under_value), - 'prod_over':float(prod_over_value), 'prod_range':float(prod_range_value)} + 'ref_over':float(ref_over_value), 'ref_range':float(ref_range_value),'current_majority':float(current_expected_value), 'current_under':float(current_under_value), + 'current_over':float(current_over_value), 'current_range':float(current_range_value)} for feature_name in cat_feature_names: feature_type = 'cat' @@ -176,15 +176,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, ref_range_value = 1 if (ref_overal_value != ref_under_value) or (ref_over_value != ref_overal_value) \ or (ref_under_value != ref_overal_value) else 0 - prod_overal_value = production_data[feature_name].value_counts().idxmax() - prod_under_value = production_data[prod_error <= prod_quantile_5][feature_name].value_counts().idxmax() - prod_over_value = production_data[prod_error >= prod_quantile_95][feature_name].value_counts().idxmax() - prod_range_value = 1 if (prod_overal_value != prod_under_value) or (prod_over_value != prod_overal_value) \ - or (prod_under_value != prod_overal_value) else 0 + current_overal_value = current_data[feature_name].value_counts().idxmax() + current_under_value = current_data[current_error <= current_quantile_5][feature_name].value_counts().idxmax() + current_over_value = current_data[current_error >= current_quantile_95][feature_name].value_counts().idxmax() + current_range_value = 1 if (current_overal_value != current_under_value) or (current_over_value != current_overal_value) \ + or (current_under_value != current_overal_value) else 0 error_bias[feature_name] = {'feature_type':feature_type, 'ref_majority':float(ref_overal_value), 'ref_under':float(ref_under_value), - 
'ref_over':float(ref_over_value), 'ref_range':float(ref_range_value),'prod_majority':float(prod_overal_value), 'prod_under':float(prod_under_value), - 'prod_over':float(prod_over_value), 'prod_range':float(prod_range_value)} + 'ref_over':float(ref_over_value), 'ref_range':float(ref_range_value),'current_majority':float(current_overal_value), 'current_under':float(current_under_value), + 'current_over':float(current_over_value), 'current_range':float(current_range_value)} result['metrics']['error_bias'] = error_bias diff --git a/evidently/dashboard/dashboard.py b/evidently/dashboard/dashboard.py index 7d8ace1a8d..89f3a8e5c7 100644 --- a/evidently/dashboard/dashboard.py +++ b/evidently/dashboard/dashboard.py @@ -17,6 +17,7 @@ from evidently.model.dashboard import DashboardInfo from evidently.pipeline.pipeline import Pipeline from evidently.tabs.base_tab import Tab +from evidently.utils import NumpyEncoder @dataclasses.dataclass() @@ -27,7 +28,7 @@ class TemplateParams: def __dashboard_info_to_json(di: DashboardInfo): - return json.dumps(asdict(di)) + return json.dumps(asdict(di), cls=NumpyEncoder) def inline_template(params: TemplateParams): @@ -136,11 +137,11 @@ def get_analyzers(self): def calculate(self, reference_data: pandas.DataFrame, - production_data: pandas.DataFrame, + current_data: pandas.DataFrame, column_mapping: dict = None): - self.execute(reference_data, production_data, column_mapping) + self.execute(reference_data, current_data, column_mapping) for tab in self.tabsData: - tab.calculate(reference_data, production_data, column_mapping, self.analyzers_results) + tab.calculate(reference_data, current_data, column_mapping, self.analyzers_results) def __render(self, template: typing.Callable[[TemplateParams], str]): dashboard_id = "evidently_dashboard_" + str(uuid.uuid4()).replace("-", "") @@ -159,7 +160,7 @@ def _json(self): dashboard_id = "evidently_dashboard_" + str(uuid.uuid4()).replace("-", "") tab_widgets = [t.info() for t in self.tabsData] di = 
DashboardInfo(dashboard_id, [item for tab in tab_widgets for item in tab if item is not None]) - return json.dumps(asdict(di)) + return json.dumps(asdict(di), cls=NumpyEncoder) def _save_to_json(self, filename): parent_dir = os.path.dirname(filename) diff --git a/evidently/examples/bicycle_demand_monitoring.ipynb b/evidently/examples/bicycle_demand_monitoring.ipynb index bc1e8642ca..ea9e34046d 100644 --- a/evidently/examples/bicycle_demand_monitoring.ipynb +++ b/evidently/examples/bicycle_demand_monitoring.ipynb @@ -429,7 +429,7 @@ "outputs": [], "source": [ "reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']\n", - "production = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']" + "current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']" ] }, { @@ -652,7 +652,7 @@ "outputs": [], "source": [ "ref_prediction = regressor.predict(reference[numerical_features + categorical_features])\n", - "prod_prediction = regressor.predict(production[numerical_features + categorical_features])" + "current_prediction = regressor.predict(current[numerical_features + categorical_features])" ] }, { @@ -662,7 +662,7 @@ "outputs": [], "source": [ "reference['prediction'] = ref_prediction\n", - "production['prediction'] = prod_prediction" + "current['prediction'] = current_prediction" ] }, { @@ -717,22 +717,22 @@ "}\n", "\n", "\n", "\n", - "
Loading...
\n", + "
Loading...
\n", "\n" ], "text/plain": [ @@ -754,7 +754,7 @@ "metadata": {}, "outputs": [], "source": [ - "#regression_perfomance_dashboard.save('regression_performance_at_training.html')" + "#regression_perfomance_dashboard.save('reports/regression_performance_at_training.html')" ] }, { @@ -770,7 +770,7 @@ "metadata": {}, "outputs": [], "source": [ - "regression_perfomance_dashboard.calculate(reference, production.loc['2011-01-29 00:00:00':'2011-02-07 23:00:00'], \n", + "regression_perfomance_dashboard.calculate(reference, current.loc['2011-01-29 00:00:00':'2011-02-07 23:00:00'], \n", " column_mapping=column_mapping)" ] }, @@ -795,22 +795,22 @@ "}\n", "\n", "\n", "\n", - "
Loading...
\n", + "
Loading...
\n", "\n" ], "text/plain": [ @@ -832,7 +832,7 @@ "metadata": {}, "outputs": [], "source": [ - "#regression_perfomance_dashboard.save('regression_performance_after_week1.html')" + "#regression_perfomance_dashboard.save('reports/regression_performance_after_week1.html')" ] }, { @@ -842,7 +842,7 @@ "outputs": [], "source": [ "target_drift_dashboard = Dashboard(tabs=[NumTargetDriftTab])\n", - "target_drift_dashboard.calculate(reference, production.loc['2011-01-29 00:00:00':'2011-02-07 23:00:00'], \n", + "target_drift_dashboard.calculate(reference, current.loc['2011-01-29 00:00:00':'2011-02-07 23:00:00'], \n", " column_mapping=column_mapping)" ] }, @@ -867,22 +867,22 @@ "}\n", "\n", "\n", "\n", - "
Loading...
\n", + "
Loading...
\n", "\n" ], "text/plain": [ @@ -904,7 +904,7 @@ "metadata": {}, "outputs": [], "source": [ - "#target_drift_dashboard.save('target_drift_after_week1.html')" + "#target_drift_dashboard.save('reports/target_drift_after_week1.html')" ] }, { @@ -920,7 +920,7 @@ "metadata": {}, "outputs": [], "source": [ - "regression_perfomance_dashboard.calculate(reference, production.loc['2011-02-07 00:00:00':'2011-02-14 23:00:00'], \n", + "regression_perfomance_dashboard.calculate(reference, current.loc['2011-02-07 00:00:00':'2011-02-14 23:00:00'], \n", " column_mapping=column_mapping)" ] }, @@ -945,22 +945,22 @@ "}\n", "\n", "\n", "\n", - "
Loading...
\n", + "
Loading...
\n", "\n" ], "text/plain": [ @@ -982,7 +982,7 @@ "metadata": {}, "outputs": [], "source": [ - "#regression_perfomance_dashboard.save('regression_performance_after_week2.html')" + "#regression_perfomance_dashboard.save('reports/regression_performance_after_week2.html')" ] }, { @@ -991,7 +991,7 @@ "metadata": {}, "outputs": [], "source": [ - "target_drift_dashboard.calculate(reference, production.loc['2011-02-07 00:00:00':'2011-02-14 23:00:00'], \n", + "target_drift_dashboard.calculate(reference, current.loc['2011-02-07 00:00:00':'2011-02-14 23:00:00'], \n", " column_mapping=column_mapping)" ] }, @@ -1016,22 +1016,22 @@ "}\n", "\n", "\n", "\n", - "
Loading...
\n", + "
Loading...
\n", "\n" ], "text/plain": [ @@ -1053,7 +1053,7 @@ "metadata": {}, "outputs": [], "source": [ - "#target_drift_dashboard.save('target_drift_after_week2.html')" + "#target_drift_dashboard.save('reports/target_drift_after_week2.html')" ] }, { @@ -1069,7 +1069,7 @@ "metadata": {}, "outputs": [], "source": [ - "regression_perfomance_dashboard.calculate(reference, production.loc['2011-02-15 00:00:00':'2011-02-21 23:00:00'], \n", + "regression_perfomance_dashboard.calculate(reference, current.loc['2011-02-15 00:00:00':'2011-02-21 23:00:00'], \n", " column_mapping=column_mapping)" ] }, @@ -1094,22 +1094,22 @@ "}\n", "\n", "\n", "\n", - "
Loading...
\n", + "
Loading...
\n", "\n" ], "text/plain": [ @@ -1131,7 +1131,7 @@ "metadata": {}, "outputs": [], "source": [ - "#regression_perfomance_dashboard.save('regression_performance_after_week3.html')" + "#regression_perfomance_dashboard.save('reports/regression_performance_after_week3.html')" ] }, { @@ -1140,7 +1140,7 @@ "metadata": {}, "outputs": [], "source": [ - "target_drift_dashboard.calculate(reference, production.loc['2011-02-15 00:00:00':'2011-02-21 23:00:00'], \n", + "target_drift_dashboard.calculate(reference, current.loc['2011-02-15 00:00:00':'2011-02-21 23:00:00'], \n", " column_mapping=column_mapping)" ] }, @@ -1165,22 +1165,22 @@ "}\n", "\n", "\n", "\n", - "
Loading...
\n", + "
Loading...
\n", "\n" ], "text/plain": [ @@ -1202,7 +1202,7 @@ "metadata": {}, "outputs": [], "source": [ - "#target_drift_dashboard.save('target_drift_after_week3.html')" + "#target_drift_dashboard.save('reports/target_drift_after_week3.html')" ] }, { @@ -1225,18 +1225,18 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "data_drift_dashboard = Dashboard(tabs=[DataDriftTab])\n", - "data_drift_dashboard.calculate(reference, production.loc['2011-01-29 00:00:00':'2011-02-07 23:00:00'], \n", + "data_drift_dashboard.calculate(reference, current.loc['2011-01-29 00:00:00':'2011-02-07 23:00:00'], \n", " column_mapping=column_mapping)" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -1253,29 +1253,29 @@ "}\n", "\n", "\n", "\n", - "
Loading...
\n", + "
Loading...
\n", "\n" ], "text/plain": [ "" ] }, - "execution_count": 38, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1286,11 +1286,11 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ - "#data_drift_dashboard.save(\"data_drift_dashboard_after_week1.html\")" + "#data_drift_dashboard.save(\"reports/data_drift_dashboard_after_week1.html\")" ] } ], diff --git a/evidently/model_profile/model_profile.py b/evidently/model_profile/model_profile.py index 0c38311155..e48e275b1a 100644 --- a/evidently/model_profile/model_profile.py +++ b/evidently/model_profile/model_profile.py @@ -1,10 +1,13 @@ import json -import pandas + +import pandas +import numpy as np from datetime import datetime from typing import List, Type from evidently.pipeline.pipeline import Pipeline from evidently.profile_sections.base_profile_section import ProfileSection +from evidently.utils import NumpyEncoder class Profile(Pipeline): @@ -14,17 +17,17 @@ def __init__(self, sections: List[Type[ProfileSection]]): def calculate(self, reference_data: pandas.DataFrame, - production_data: pandas.DataFrame, + current_data: pandas.DataFrame, column_mapping: dict = None): - self.execute(reference_data, production_data, column_mapping) + self.execute(reference_data, current_data, column_mapping) def get_analyzers(self): return list(set([analyzer for tab in self.parts for analyzer in tab.analyzers()])) def json(self): - return json.dumps(self.object()) + return json.dumps(self.object(), cls=NumpyEncoder) def object(self): result = dict([(part.part_id(), part.calculate(self.analyzers_results)) for part in self.parts]) result["timestamp"] = str(datetime.now()) - return result + return result \ No newline at end of file diff --git a/evidently/pipeline/pipeline.py b/evidently/pipeline/pipeline.py index f005d3b082..89a2e1113a 100644 --- a/evidently/pipeline/pipeline.py +++ b/evidently/pipeline/pipeline.py @@ -12,7 +12,7 @@ def 
get_analyzers(self): def execute(self, reference_data: pandas.DataFrame, - production_data: pandas.DataFrame, + current_data: pandas.DataFrame, column_mapping: dict = None): for analyzer in self.get_analyzers(): - self.analyzers_results[analyzer] = analyzer().calculate(reference_data, production_data, column_mapping) + self.analyzers_results[analyzer] = analyzer().calculate(reference_data, current_data, column_mapping) diff --git a/evidently/profile/__pycache__/__init__.cpython-36.pyc b/evidently/profile/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 6dff6567cb..0000000000 Binary files a/evidently/profile/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/evidently/profile/__pycache__/data_drift_profile.cpython-36.pyc b/evidently/profile/__pycache__/data_drift_profile.cpython-36.pyc deleted file mode 100644 index 3ffa9dbe71..0000000000 Binary files a/evidently/profile/__pycache__/data_drift_profile.cpython-36.pyc and /dev/null differ diff --git a/evidently/profile/__pycache__/profile.cpython-36.pyc b/evidently/profile/__pycache__/profile.cpython-36.pyc deleted file mode 100644 index b1d72af241..0000000000 Binary files a/evidently/profile/__pycache__/profile.cpython-36.pyc and /dev/null differ diff --git a/evidently/runner/dashboard_runner.py b/evidently/runner/dashboard_runner.py index 1a74ee44c7..05435bba9f 100644 --- a/evidently/runner/dashboard_runner.py +++ b/evidently/runner/dashboard_runner.py @@ -28,7 +28,7 @@ def __init__(self, options: DashboardRunnerOptions): self.options = options def run(self): - (reference_data, production_data) = self._parse_data() + (reference_data, current_data) = self._parse_data() tabs = [] @@ -39,5 +39,5 @@ def run(self): tabs.append(tab_class) dashboard = Dashboard(tabs=tabs) - dashboard.calculate(reference_data, production_data, self.options.column_mapping) + dashboard.calculate(reference_data, current_data, self.options.column_mapping) dashboard.save(self.options.output_path + ".html") 
diff --git a/evidently/runner/loader.py b/evidently/runner/loader.py new file mode 100644 index 0000000000..6066a19599 --- /dev/null +++ b/evidently/runner/loader.py @@ -0,0 +1,90 @@ +import dataclasses +import logging +import random +from typing import Callable, Union, Optional, List + +import pandas as pd + + +@dataclasses.dataclass +class SamplingOptions: + type: str = "none" + random_seed: int = 1 + ratio: float = 1.0 + n: int = 1 + + +@dataclasses.dataclass +class DataOptions: + date_column: str + separator: str + # whether the csv file contains a header row + header: bool + # should be list of names, or None if columns should be inferred from data + column_names: Optional[List[str]] + + def __init__(self, date_column: str = "datetime", separator=",", header=True, column_names=None): + self.date_column = date_column + self.header = header + self.separator = separator + self.column_names = column_names + + +def _skiprows(sampling_options: SamplingOptions) -> Union[Callable[[int], bool], None]: + if sampling_options.type == "none": + return None + if sampling_options.type == "nth": + if sampling_options.n < 1: + raise Exception("nth sampling should have 'n' parameter >= 1") + return __simple(sampling_options) + if sampling_options.type == "random": + sk = RandomizedSkipRows(sampling_options.ratio, sampling_options.random_seed) + return sk.skiprows + + +def __simple(sampling_options: SamplingOptions): + def func(row_idx): + if row_idx == 0: + result = False + else: + rem = (row_idx - 1) % sampling_options.n + result = rem != 0 + return result + return func + + +class DataLoader: + def __init__(self): + pass + + def load(self, filename: str, data_options: DataOptions, sampling_options: SamplingOptions = None): + sampling_opts = SamplingOptions("none", 0, 0) if sampling_options is None else sampling_options + parse_dates = [data_options.date_column] \ + if data_options.date_column \ + else False + return pd.read_csv(filename, + header=0 if data_options.header else None, +
sep=data_options.separator, + skiprows=_skiprows(sampling_opts), + parse_dates=parse_dates) + + +CHUNK_SIZE = 1000 + + +class RandomizedSkipRows: + def __init__(self, ratio: float, random_seed: int): + self.random = random.Random(random_seed) + self.ratio = ratio + self.selected_rows = self._select() + + def skiprows(self, row_index: int): + if row_index == 0: + return False + if row_index % CHUNK_SIZE == 0: + self.selected_rows = self._select() + idx = row_index - int(row_index / CHUNK_SIZE) * CHUNK_SIZE + return self.selected_rows[idx] + + def _select(self): + return [False if self.random.random() < self.ratio else True for x in range(1000)] diff --git a/evidently/runner/profile_runner.py b/evidently/runner/profile_runner.py index 3b8e354342..979141d7cd 100644 --- a/evidently/runner/profile_runner.py +++ b/evidently/runner/profile_runner.py @@ -10,6 +10,7 @@ from evidently.profile_sections.prob_classification_performance_profile_section import ProbClassificationPerformanceProfileSection from evidently.profile_sections.regression_performance_profile_section import RegressionPerformanceProfileSection from evidently.runner.runner import RunnerOptions, Runner +from evidently.utils import NumpyEncoder @dataclass @@ -27,13 +28,14 @@ class ProfileRunnerOptions(RunnerOptions): ) + class ProfileRunner(Runner): def __init__(self, options: ProfileRunnerOptions): super().__init__(options) self.options = options def run(self): - (reference_data, production_data) = self._parse_data() + (reference_data, current_data) = self._parse_data() parts = [] @@ -44,10 +46,10 @@ def run(self): parts.append(part_class) profile = Profile(sections=parts) - profile.calculate(reference_data, production_data, self.options.column_mapping) + profile.calculate(reference_data, current_data, self.options.column_mapping) output_path = self.options.output_path \ if self.options.output_path.endswith(".json") \ else self.options.output_path + ".json" with open(output_path, 'w') as f: - 
json.dump(profile.object(), f, indent=2 if self.options.pretty_print else None) + json.dump(profile.object(), f, indent=2 if self.options.pretty_print else None, cls=NumpyEncoder) diff --git a/evidently/runner/runner.py b/evidently/runner/runner.py index 297108b72d..02f20d9364 100644 --- a/evidently/runner/runner.py +++ b/evidently/runner/runner.py @@ -1,31 +1,19 @@ +import logging from typing import Optional, List, Dict from dataclasses import dataclass -import pandas as pd - - -class DataOptions: - date_column: str - separator: str - # is csv file contains header row - header: bool - # should be list of names, or None if columns should be inferred from data - column_names: Optional[List[str]] - - def __init__(self, date_column: str = "datetime", separator=",", header=True, column_names=None): - self.date_column = date_column - self.header = header - self.separator = separator - self.column_names = column_names +from evidently.runner.loader import DataLoader, SamplingOptions, DataOptions @dataclass class RunnerOptions: reference_data_path: str reference_data_options: DataOptions - production_data_path: Optional[str] - production_data_options: Optional[DataOptions] + reference_data_sampling: Optional[SamplingOptions] + current_data_path: Optional[str] + current_data_options: Optional[DataOptions] + current_data_sampling: Optional[SamplingOptions] column_mapping: Dict[str, str] output_path: str @@ -35,23 +23,18 @@ def __init__(self, options: RunnerOptions): self.options = options def _parse_data(self): - ref_parse_dates = [self.options.reference_data_options.date_column] \ - if self.options.reference_data_options.date_column \ - else False - reference_data = pd.read_csv(self.options.reference_data_path, - header=0 if self.options.reference_data_options.header else None, - sep=self.options.reference_data_options.separator, - parse_dates=ref_parse_dates) - - if self.options.production_data_path: - prod_parse_dates = [self.options.production_data_options.date_column] \ 
- if self.options.production_data_options.date_column \ - else False - production_data = pd.read_csv(self.options.production_data_path, - header=0 if self.options.production_data_options.header else None, - sep=self.options.production_data_options.separator, - parse_dates=prod_parse_dates) + loader = DataLoader() + + reference_data = loader.load(self.options.reference_data_path, + self.options.reference_data_options, + self.options.reference_data_sampling) + logging.info(f"reference dataset loaded: {len(reference_data)} rows") + if self.options.current_data_path: + current_data = loader.load(self.options.current_data_path, + self.options.current_data_options, + self.options.current_data_sampling) + logging.info(f"current dataset loaded: {len(current_data)} rows") else: - production_data = None + current_data = None - return reference_data, production_data + return reference_data, current_data diff --git a/evidently/tabs/base_tab.py b/evidently/tabs/base_tab.py index 168d457ae0..f8dc1db7a9 100644 --- a/evidently/tabs/base_tab.py +++ b/evidently/tabs/base_tab.py @@ -22,12 +22,12 @@ def analyzers(self) -> List[Type[Analyzer]]: return list(set([analyzer for widget in self.widgets for analyzer in widget.analyzers()])) def calculate(self, reference_data: pandas.DataFrame, - production_data: pandas.DataFrame, + current_data: pandas.DataFrame, column_mapping: Dict, analyzers_results: Dict): self.widgets = self._get_widgets() for widget in self.widgets: - widget.calculate(reference_data, production_data, column_mapping, analyzers_results) + widget.calculate(reference_data, current_data, column_mapping, analyzers_results) def info(self) -> List[BaseWidgetInfo]: return [w.get_info() for w in self.widgets] diff --git a/evidently/utils/__init__.py b/evidently/utils/__init__.py new file mode 100644 index 0000000000..64d6a69536 --- /dev/null +++ b/evidently/utils/__init__.py @@ -0,0 +1,22 @@ +import json +import numpy as np + + +_integer_types = (np.int_, np.intc, np.intp, 
np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64) +_float_types = (np.float_, np.float16, np.float32, np.float64) + + +class NumpyEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, _integer_types): + return int(obj) + elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)): + return float(obj) + elif isinstance(obj, (np.ndarray,)): + return obj.tolist() + elif isinstance(obj, np.bool_): + return bool(obj) + elif isinstance(obj, np.void): + return None + + return json.JSONEncoder.default(self, obj) diff --git a/evidently/widgets/cat_prediction_drift_widget.py b/evidently/widgets/cat_prediction_drift_widget.py index b76c52b2d1..7d664de5e2 100644 --- a/evidently/widgets/cat_prediction_drift_widget.py +++ b/evidently/widgets/cat_prediction_drift_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -66,31 +66,31 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #ref_feature_vc = reference_data[prediction_column][np.isfinite(reference_data[prediction_column])].value_counts() - #prod_feature_vc = production_data[prediction_column][np.isfinite(production_data[prediction_column])].value_counts() + 
#current_feature_vc = current_data[prediction_column][np.isfinite(current_data[prediction_column])].value_counts() #keys = set(list(reference_data[prediction_column][np.isfinite(reference_data[prediction_column])].unique()) + - # list(production_data[prediction_column][np.isfinite(production_data[prediction_column])].unique())) + # list(current_data[prediction_column][np.isfinite(current_data[prediction_column])].unique())) ref_feature_vc = reference_data[prediction_column].value_counts() - prod_feature_vc = production_data[prediction_column].value_counts() + current_feature_vc = current_data[prediction_column].value_counts() keys = set(list(reference_data[prediction_column].unique()) + - list(production_data[prediction_column].unique())) + list(current_data[prediction_column].unique())) ref_feature_dict = dict.fromkeys(keys, 0) for key, item in zip(ref_feature_vc.index, ref_feature_vc.values): ref_feature_dict[key] = item - prod_feature_dict = dict.fromkeys(keys, 0) - for key, item in zip(prod_feature_vc.index, prod_feature_vc.values): - prod_feature_dict[key] = item + current_feature_dict = dict.fromkeys(keys, 0) + for key, item in zip(current_feature_vc.index, current_feature_vc.values): + current_feature_dict[key] = item f_exp = [value[1] for value in sorted(ref_feature_dict.items())] - f_obs = [value[1] for value in sorted(prod_feature_dict.items())] + f_obs = [value[1] for value in sorted(current_feature_dict.items())] pred_p_value = chisquare(f_exp, f_obs)[1] @@ -102,7 +102,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, fig.add_trace(go.Histogram(x=reference_data[prediction_column], marker_color=grey, opacity=0.6, nbinsx=10, name='Reference', histnorm='probability')) - fig.add_trace(go.Histogram(x=production_data[prediction_column], + fig.add_trace(go.Histogram(x=current_data[prediction_column], marker_color=red, opacity=0.6,nbinsx=10, name='Current', histnorm='probability')) fig.update_layout( diff --git 
a/evidently/widgets/cat_target_drift_widget.py b/evidently/widgets/cat_target_drift_widget.py index f0632454ed..48ce1dcc00 100644 --- a/evidently/widgets/cat_target_drift_widget.py +++ b/evidently/widgets/cat_target_drift_widget.py @@ -29,7 +29,7 @@ def analyzers(self): def get_info(self) -> BaseWidgetInfo: return self.wi - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -62,32 +62,32 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #calculate output drift #ref_feature_vc = reference_data[target_column][np.isfinite(reference_data[target_column])].value_counts() - #prod_feature_vc = production_data[target_column][np.isfinite(production_data[target_column])].value_counts() + #current_feature_vc = current_data[target_column][np.isfinite(current_data[target_column])].value_counts() #keys = set(list(reference_data[target_column][np.isfinite(reference_data[target_column])].unique()) + - # list(production_data[target_column][np.isfinite(production_data[target_column])].unique())) + # list(current_data[target_column][np.isfinite(current_data[target_column])].unique())) ref_feature_vc = reference_data[target_column].value_counts() - prod_feature_vc = production_data[target_column].value_counts() + current_feature_vc = current_data[target_column].value_counts() keys = 
set(list(reference_data[target_column].unique()) + - list(production_data[target_column].unique())) + list(current_data[target_column].unique())) ref_feature_dict = dict.fromkeys(keys, 0) for key, item in zip(ref_feature_vc.index, ref_feature_vc.values): ref_feature_dict[key] = item - prod_feature_dict = dict.fromkeys(keys, 0) - for key, item in zip(prod_feature_vc.index, prod_feature_vc.values): - prod_feature_dict[key] = item + current_feature_dict = dict.fromkeys(keys, 0) + for key, item in zip(current_feature_vc.index, current_feature_vc.values): + current_feature_dict[key] = item f_exp = [value[1] for value in sorted(ref_feature_dict.items())] - f_obs = [value[1] for value in sorted(prod_feature_dict.items())] + f_obs = [value[1] for value in sorted(current_feature_dict.items())] target_p_value = chisquare(f_exp, f_obs)[1] @@ -99,7 +99,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, fig.add_trace(go.Histogram(x=reference_data[target_column], marker_color=grey, opacity=0.6, nbinsx=10, name='Reference', histnorm='probability')) - fig.add_trace(go.Histogram(x=production_data[target_column], + fig.add_trace(go.Histogram(x=current_data[target_column], marker_color=red, opacity=0.6,nbinsx=10, name='Current', histnorm='probability')) fig.update_layout( diff --git a/evidently/widgets/cat_target_pred_feature_table_widget.py b/evidently/widgets/cat_target_pred_feature_table_widget.py index 0ea0aba804..4f9ce619e5 100644 --- a/evidently/widgets/cat_target_pred_feature_table_widget.py +++ b/evidently/widgets/cat_target_pred_feature_table_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("neither target nor prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: 
date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -86,8 +86,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #create target plot reference_data['dataset'] = 'Reference' - production_data['dataset'] = 'Current' - merged_data = pd.concat([reference_data, production_data]) + current_data['dataset'] = 'Current' + merged_data = pd.concat([reference_data, current_data]) target_fig = px.histogram(merged_data, x=feature_name, color=target_column, facet_col="dataset", category_orders={"dataset": ["Reference", "Current"]}) @@ -166,8 +166,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #create target plot #TO DO%: out pf the cycle reference_data['dataset'] = 'Reference' - production_data['dataset'] = 'Current' - merged_data = pd.concat([reference_data, production_data]) + current_data['dataset'] = 'Current' + merged_data = pd.concat([reference_data, current_data]) target_fig = px.histogram(merged_data, x=feature_name, color=target_column, facet_col="dataset", category_orders={"dataset": ["Reference", "Current"]}) @@ -228,8 +228,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #create target plot reference_data['dataset'] = 'Reference' - production_data['dataset'] = 'Current' - merged_data = pd.concat([reference_data, production_data]) + current_data['dataset'] = 'Current' + merged_data = pd.concat([reference_data, current_data]) prediction_fig = px.histogram(merged_data, x=feature_name, color=prediction_column, facet_col="dataset", category_orders={"dataset": ["Reference", "Current"]}) diff --git a/evidently/widgets/class_confusion_based_feature_distr_table_widget.py b/evidently/widgets/class_confusion_based_feature_distr_table_widget.py index 066d3532ce..c66646e207 100644 --- a/evidently/widgets/class_confusion_based_feature_distr_table_widget.py +++ b/evidently/widgets/class_confusion_based_feature_distr_table_widget.py @@ -31,7 
+31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("neither target nor prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,7 +64,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if prediction_column is not None and target_column is not None: - if production_data is not None: + if current_data is not None: additional_graphs_data = [] params_data = [] @@ -85,8 +85,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #create confusion based plots reference_data['dataset'] = 'Reference' - production_data['dataset'] = 'Current' - merged_data = pd.concat([reference_data, production_data]) + current_data['dataset'] = 'Current' + merged_data = pd.concat([reference_data, current_data]) fig = px.histogram(merged_data, x=feature_name, color=target_column, facet_col="dataset", histnorm = '', category_orders={"dataset": ["Reference", "Current"]}) diff --git a/evidently/widgets/class_prod_class_support_widget.py b/evidently/widgets/class_prod_class_support_widget.py index 72bbeaf50d..af889527bd 100644 --- a/evidently/widgets/class_prod_class_support_widget.py +++ b/evidently/widgets/class_prod_class_support_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, 
analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,12 +64,12 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #plot support bar - metrics_matrix = metrics.classification_report(production_data[target_column], production_data[prediction_column], + metrics_matrix = metrics.classification_report(current_data[target_column], current_data[prediction_column], output_dict=True) metrics_frame = pd.DataFrame(metrics_matrix) support = metrics_frame.iloc[-1:,:-3].values[0] diff --git a/evidently/widgets/class_prod_conf_matrix_widget.py b/evidently/widgets/class_prod_conf_matrix_widget.py index 85ec29914d..e016b1cb88 100644 --- a/evidently/widgets/class_prod_conf_matrix_widget.py +++ b/evidently/widgets/class_prod_conf_matrix_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,17 +64,17 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - 
production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #plot confusion matrix - conf_matrix = metrics.confusion_matrix(production_data[target_column], - production_data[prediction_column]) + conf_matrix = metrics.confusion_matrix(current_data[target_column], + current_data[prediction_column]) z = conf_matrix.astype(int) - labels = target_names if target_names else sorted(set(production_data[target_column])) + labels = target_names if target_names else sorted(set(current_data[target_column])) # change each element of z to type string for annotations z_text = [[str(y) for y in x] for x in z] diff --git a/evidently/widgets/class_prod_metrics_matrix_widget.py b/evidently/widgets/class_prod_metrics_matrix_widget.py index aa5dbec0ba..e809c0cc7f 100644 --- a/evidently/widgets/class_prod_metrics_matrix_widget.py +++ b/evidently/widgets/class_prod_metrics_matrix_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,12 +64,12 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and 
target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #plot support bar - metrics_matrix = metrics.classification_report(production_data[target_column], production_data[prediction_column], + metrics_matrix = metrics.classification_report(current_data[target_column], current_data[prediction_column], output_dict=True) metrics_frame = pd.DataFrame(metrics_matrix) diff --git a/evidently/widgets/class_prod_quality_metrics_widget.py b/evidently/widgets/class_prod_quality_metrics_widget.py index 42e97890ed..ecc8f03c7e 100644 --- a/evidently/widgets/class_prod_quality_metrics_widget.py +++ b/evidently/widgets/class_prod_quality_metrics_widget.py @@ -30,7 +30,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No reference data with target and prediction provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -59,18 +59,18 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: + if current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #calculate quality metrics - accuracy_score = 
metrics.accuracy_score(production_data[target_column], production_data[prediction_column]) - avg_precision = metrics.precision_score(production_data[target_column], production_data[prediction_column], + accuracy_score = metrics.accuracy_score(current_data[target_column], current_data[prediction_column]) + avg_precision = metrics.precision_score(current_data[target_column], current_data[prediction_column], average='macro') - avg_recall = metrics.recall_score(production_data[target_column], production_data[prediction_column], + avg_recall = metrics.recall_score(current_data[target_column], current_data[prediction_column], average='macro') - avg_f1 = metrics.f1_score(production_data[target_column], production_data[prediction_column], + avg_f1 = metrics.f1_score(current_data[target_column], current_data[prediction_column], average='macro') self.wi = BaseWidgetInfo( diff --git a/evidently/widgets/class_ref_class_support_widget.py b/evidently/widgets/class_ref_class_support_widget.py index 19504cc85e..2150368406 100644 --- a/evidently/widgets/class_ref_class_support_widget.py +++ b/evidently/widgets/class_ref_class_support_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -94,7 +94,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "data": support_bar_json['data'], "layout": support_bar_json['layout'] diff --git a/evidently/widgets/class_ref_conf_matrix_widget.py 
b/evidently/widgets/class_ref_conf_matrix_widget.py index 999a452aac..0d02edfc91 100644 --- a/evidently/widgets/class_ref_conf_matrix_widget.py +++ b/evidently/widgets/class_ref_conf_matrix_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -96,7 +96,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "data": conf_matrix_json['data'], "layout": conf_matrix_json['layout'] diff --git a/evidently/widgets/class_ref_metrics_matrix_widget.py b/evidently/widgets/class_ref_metrics_matrix_widget.py index b711f23dab..cf59d8f88c 100644 --- a/evidently/widgets/class_ref_metrics_matrix_widget.py +++ b/evidently/widgets/class_ref_metrics_matrix_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -98,7 +98,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "data": 
metrics_matrix_json['data'], "layout": metrics_matrix_json['layout'] diff --git a/evidently/widgets/class_ref_quality_metrics_widget.py b/evidently/widgets/class_ref_quality_metrics_widget.py index 41b9de1f95..5bd0ccf7ae 100644 --- a/evidently/widgets/class_ref_quality_metrics_widget.py +++ b/evidently/widgets/class_ref_quality_metrics_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data with target and prediction provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/class_target_name_widget.py b/evidently/widgets/class_target_name_widget.py index 90e3f70a2f..2f412ed0b6 100644 --- a/evidently/widgets/class_target_name_widget.py +++ b/evidently/widgets/class_target_name_widget.py @@ -33,7 +33,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data with target and prediction provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/counter_widget.py b/evidently/widgets/counter_widget.py index efecf10aa2..be5295c1ea 100644 --- a/evidently/widgets/counter_widget.py +++ b/evidently/widgets/counter_widget.py @@ -22,7 +22,7 @@ def analyzers(self): def calculate(self, reference_data: pandas.DataFrame, - production_data: pandas.DataFrame, + current_data: pandas.DataFrame, column_mapping: Dict, analyzes_results): self.wi = BaseWidgetInfo( diff 
--git a/evidently/widgets/data_drift_table_widget.py b/evidently/widgets/data_drift_table_widget.py index 98147f80c1..98bb6922ef 100644 --- a/evidently/widgets/data_drift_table_widget.py +++ b/evidently/widgets/data_drift_table_widget.py @@ -30,7 +30,7 @@ def get_info(self) -> BaseWidgetInfo: def calculate(self, reference_data: pd.DataFrame, - production_data: pd.DataFrame, + current_data: pd.DataFrame, column_mapping, analyzers_results): results = analyzers_results[DataDriftAnalyzer] @@ -44,7 +44,7 @@ def calculate(self, date_column = results['utility_columns']['date'] for feature_name in num_feature_names: - prod_small_hist = results['metrics'][feature_name]["prod_small_hist"] + current_small_hist = results['metrics'][feature_name]["current_small_hist"] ref_small_hist = results['metrics'][feature_name]["ref_small_hist"] feature_type = results['metrics'][feature_name]["feature_type"] @@ -76,8 +76,8 @@ def calculate(self, "y": list(ref_small_hist[0]) }, "f4": { - "x": list(prod_small_hist[1]), - "y": list(prod_small_hist[0]) + "x": list(current_small_hist[1]), + "y": list(current_small_hist[0]) }, "f2": distr_sim_test, "f5": round(p_value, 6) @@ -85,7 +85,7 @@ def calculate(self, ) for feature_name in cat_feature_names: - prod_small_hist = results['metrics'][feature_name]["prod_small_hist"] + current_small_hist = results['metrics'][feature_name]["current_small_hist"] ref_small_hist = results['metrics'][feature_name]["ref_small_hist"] feature_type = results['metrics'][feature_name]["feature_type"] @@ -118,8 +118,8 @@ def calculate(self, "y": list(ref_small_hist[0]) }, "f4": { - "x": list(prod_small_hist[1]), - "y": list(prod_small_hist[0]) + "x": list(current_small_hist[1]), + "y": list(current_small_hist[0]) }, "f2": distr_sim_test, "f5": round(p_value, 6) @@ -135,7 +135,7 @@ def calculate(self, marker_color=grey, opacity=0.6, nbinsx=10, name='Reference', histnorm='probability')) - fig.add_trace(go.Histogram(x=production_data[feature_name], + 
fig.add_trace(go.Histogram(x=current_data[feature_name], marker_color=red, opacity=0.6, nbinsx=10, name='Current', histnorm='probability')) @@ -161,8 +161,8 @@ def calculate(self, fig = go.Figure() fig.add_trace(go.Scatter( - x=production_data[date_column] if date_column else production_data.index, - y=production_data[feature_name], + x=current_data[date_column] if date_column else current_data.index, + y=current_data[feature_name], mode='markers', name='Current', marker=dict( diff --git a/evidently/widgets/num_prediction_corr_widget.py b/evidently/widgets/num_prediction_corr_widget.py index 2726f90155..df3e52ac61 100644 --- a/evidently/widgets/num_prediction_corr_widget.py +++ b/evidently/widgets/num_prediction_corr_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,7 +64,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #calculate corr ref_pred_corr = reference_data[num_feature_names + [prediction_column]].corr()[prediction_column] - prod_pred_corr = production_data[num_feature_names + [prediction_column]].corr()[prediction_column] + current_pred_corr = current_data[num_feature_names + [prediction_column]].corr()[prediction_column] #plot output correlations pred_corr = go.Figure() @@ -72,7 +72,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, pred_corr.add_trace(go.Bar(y = ref_pred_corr, x = ref_pred_corr.index, marker_color = grey, name = 'Reference')) - pred_corr.add_trace(go.Bar(y = prod_pred_corr, x = ref_pred_corr.index, + pred_corr.add_trace(go.Bar(y = current_pred_corr, x = 
ref_pred_corr.index, marker_color = red, name = 'Current')) pred_corr.update_layout(xaxis_title = "Features", yaxis_title = "Correlation", diff --git a/evidently/widgets/num_prediction_drift_widget.py b/evidently/widgets/num_prediction_drift_widget.py index 08440b1a55..1347e081cc 100644 --- a/evidently/widgets/num_prediction_drift_widget.py +++ b/evidently/widgets/num_prediction_drift_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -63,12 +63,12 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, if prediction_column is not None: #calculate output drift - pred_p_value = ks_2samp(reference_data[prediction_column], production_data[prediction_column])[1] + pred_p_value = ks_2samp(reference_data[prediction_column], current_data[prediction_column])[1] pred_sim_test = "detected" if pred_p_value < 0.05 else "not detected" #plot output distributions pred_distr = ff.create_distplot( - [reference_data[prediction_column], production_data[prediction_column]], + [reference_data[prediction_column], current_data[prediction_column]], ["Reference", "Current"], colors=[grey, red], show_rug=True) diff --git a/evidently/widgets/num_prediction_values_widget.py b/evidently/widgets/num_prediction_values_widget.py index 5e061c2752..27663c1c94 100644 --- a/evidently/widgets/num_prediction_values_widget.py +++ b/evidently/widgets/num_prediction_values_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: 
pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -81,8 +81,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, )) pred_values.add_trace(go.Scatter( - x = production_data[date_column] if date_column else production_data.index, - y = production_data[prediction_column], + x = current_data[date_column] if date_column else current_data.index, + y = current_data[prediction_column], mode = 'markers', name = 'Current', marker = dict( diff --git a/evidently/widgets/num_target_corr_widget.py b/evidently/widgets/num_target_corr_widget.py index c583cb64ee..5deb3d5246 100644 --- a/evidently/widgets/num_target_corr_widget.py +++ b/evidently/widgets/num_target_corr_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,7 +64,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #calculate corr ref_target_corr = reference_data[num_feature_names + [target_column]].corr()[target_column] - prod_target_corr = production_data[num_feature_names + [target_column]].corr()[target_column] + current_target_corr = current_data[num_feature_names + [target_column]].corr()[target_column] #plot output correlations target_corr = go.Figure() @@ -72,7 +72,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, target_corr.add_trace(go.Bar(y = ref_target_corr, x = 
ref_target_corr.index, marker_color = grey, name = 'Reference')) - target_corr.add_trace(go.Bar(y = prod_target_corr, x = ref_target_corr.index, + target_corr.add_trace(go.Bar(y = current_target_corr, x = ref_target_corr.index, marker_color = red, name = 'Current')) target_corr.update_layout(xaxis_title = "Features", yaxis_title = "Correlation", diff --git a/evidently/widgets/num_target_drift_widget.py b/evidently/widgets/num_target_drift_widget.py index 1bc59b3db7..5717b011b4 100644 --- a/evidently/widgets/num_target_drift_widget.py +++ b/evidently/widgets/num_target_drift_widget.py @@ -29,7 +29,7 @@ def analyzers(self): def get_info(self) -> BaseWidgetInfo: return self.wi - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -60,12 +60,12 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, if target_column is not None: #calculate output drift - target_p_value = ks_2samp(reference_data[target_column], production_data[target_column])[1] + target_p_value = ks_2samp(reference_data[target_column], current_data[target_column])[1] target_sim_test = "detected" if target_p_value < 0.05 else "not detected" #plot output distributions target_distr = ff.create_distplot( - [reference_data[target_column], production_data[target_column]], + [reference_data[target_column], current_data[target_column]], ["Reference", "Current"], colors=[grey, red], show_rug=True) diff --git a/evidently/widgets/num_target_pred_feature_table_widget.py b/evidently/widgets/num_target_pred_feature_table_widget.py index b4a58a28f9..b071f1276c 100644 --- a/evidently/widgets/num_target_pred_feature_table_widget.py +++ b/evidently/widgets/num_target_pred_feature_table_widget.py @@ -31,7 +31,7 @@ 
def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("neither target nor prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -116,8 +116,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, if prediction_column is not None: fig.add_trace( go.Scatter( - x = production_data[feature_name], - y = production_data[prediction_column], + x = current_data[feature_name], + y = current_data[prediction_column], mode = 'markers', name = 'Prediction (curr)', marker = dict( @@ -131,8 +131,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, if target_column is not None: fig.add_trace( go.Scatter( - x = production_data[feature_name], - y = production_data[target_column], + x = current_data[feature_name], + y = current_data[target_column], mode = 'markers', name = 'Target (curr)', marker = dict( diff --git a/evidently/widgets/num_target_values_widget.py b/evidently/widgets/num_target_values_widget.py index 5d990f4195..89cc763d3e 100644 --- a/evidently/widgets/num_target_values_widget.py +++ b/evidently/widgets/num_target_values_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -81,8 +81,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, )) 
target_values.add_trace(go.Scatter( - x = production_data[date_column] if date_column else production_data.index, - y = production_data[target_column], + x = current_data[date_column] if date_column else current_data.index, + y = current_data[target_column], mode = 'markers', name = 'Current', marker = dict( diff --git a/evidently/widgets/prob_class_confusion_based_feature_distr_table_widget.py b/evidently/widgets/prob_class_confusion_based_feature_distr_table_widget.py index c617f2fc52..87070d698e 100644 --- a/evidently/widgets/prob_class_confusion_based_feature_distr_table_widget.py +++ b/evidently/widgets/prob_class_confusion_based_feature_distr_table_widget.py @@ -33,7 +33,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("neither target nor prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -69,16 +69,16 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) binaraized_target = binaraizer.transform(reference_data[target_column]) - if production_data is not None: + if current_data is not None: ref_array_prediction = reference_data[prediction_column].to_numpy() ref_prediction_ids = np.argmax(ref_array_prediction, axis=-1) ref_prediction_labels = [prediction_column[x] for x in ref_prediction_ids] reference_data['prediction_labels'] = ref_prediction_labels - prod_array_prediction = production_data[prediction_column].to_numpy() - prod_prediction_ids = np.argmax(prod_array_prediction, axis=-1) - prod_prediction_labels = [prediction_column[x] for x in prod_prediction_ids] - production_data['prediction_labels'] = 
prod_prediction_labels + current_array_prediction = current_data[prediction_column].to_numpy() + current_prediction_ids = np.argmax(current_array_prediction, axis=-1) + current_prediction_labels = [prediction_column[x] for x in current_prediction_ids] + current_data['prediction_labels'] = current_prediction_labels additional_graphs_data = [] params_data = [] @@ -99,8 +99,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #create confusion based plots reference_data['dataset'] = 'Reference' - production_data['dataset'] = 'Current' - merged_data = pd.concat([reference_data, production_data]) + current_data['dataset'] = 'Current' + merged_data = pd.concat([reference_data, current_data]) fig = px.histogram(merged_data, x=feature_name, color=target_column, facet_col="dataset", histnorm = '', category_orders={"dataset": ["Reference", "Current"]}) @@ -156,15 +156,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, showticklabels=True ), yaxis = dict( - range=(0, 1), + range=(-0.1, 1.1), showticklabels=True ) ) - #PROD Prediction + #current Prediction fig.add_trace(go.Scatter( - x = production_data[production_data[target_column] == label][feature_name], - y = production_data[production_data[target_column] == label][label], + x = current_data[current_data[target_column] == label][feature_name], + y = current_data[current_data[target_column] == label][label], mode = 'markers', name = str(label) + ' (curr)', marker=dict( @@ -176,8 +176,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, ) fig.add_trace(go.Scatter( - x = production_data[production_data[target_column] != label][feature_name], - y = production_data[production_data[target_column] != label][label], + x = current_data[current_data[target_column] != label][feature_name], + y = current_data[current_data[target_column] != label][label], mode = 'markers', name = 'other (curr)', marker=dict( @@ -195,7 +195,7 @@ def 
calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, showticklabels=True ), yaxis = dict( - range=(0, 1), + range=(-0.1, 1.1), showticklabels=True ) ) @@ -316,7 +316,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, showticklabels=True ), yaxis = dict( - range=(0, 1), + range=(-0.1, 1.1), showticklabels=True ) ) diff --git a/evidently/widgets/prob_class_prod_class_support_widget.py b/evidently/widgets/prob_class_prod_class_support_widget.py index ee694555e4..524c7e3533 100644 --- a/evidently/widgets/prob_class_prod_class_support_widget.py +++ b/evidently/widgets/prob_class_prod_class_support_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,17 +64,17 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) - array_prediction = production_data[prediction_column].to_numpy() + array_prediction = current_data[prediction_column].to_numpy() prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [prediction_column[x] for x in prediction_ids] #plot support bar - metrics_matrix = 
metrics.classification_report(production_data[target_column], prediction_labels, + metrics_matrix = metrics.classification_report(current_data[target_column], prediction_labels, output_dict=True) metrics_frame = pd.DataFrame(metrics_matrix) support = metrics_frame.iloc[-1:,:-3].values[0] diff --git a/evidently/widgets/prob_class_prod_conf_matrix_widget.py b/evidently/widgets/prob_class_prod_conf_matrix_widget.py index 18ab9e5a8a..0f84a346db 100644 --- a/evidently/widgets/prob_class_prod_conf_matrix_widget.py +++ b/evidently/widgets/prob_class_prod_conf_matrix_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,22 +64,22 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) - array_prediction = production_data[prediction_column].to_numpy() + array_prediction = current_data[prediction_column].to_numpy() prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [prediction_column[x] for x in prediction_ids] #plot confusion matrix - conf_matrix = metrics.confusion_matrix(production_data[target_column], + conf_matrix = 
metrics.confusion_matrix(current_data[target_column], prediction_labels) z = conf_matrix.astype(int) - labels = sorted(set(production_data[target_column])) + labels = sorted(set(current_data[target_column])) # change each element of z to type string for annotations z_text = [[str(y) for y in x] for x in z] diff --git a/evidently/widgets/prob_class_prod_metrics_matrix_widget.py b/evidently/widgets/prob_class_prod_metrics_matrix_widget.py index 36431b2765..c05010e7b0 100644 --- a/evidently/widgets/prob_class_prod_metrics_matrix_widget.py +++ b/evidently/widgets/prob_class_prod_metrics_matrix_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,23 +64,23 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) - binaraized_target = binaraizer.transform(production_data[target_column]) + binaraized_target = binaraizer.transform(current_data[target_column]) - array_prediction = production_data[prediction_column].to_numpy() + array_prediction = current_data[prediction_column].to_numpy() 
prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [prediction_column[x] for x in prediction_ids] - labels = sorted(set(production_data[target_column])) + labels = sorted(set(current_data[target_column])) #plot support bar - metrics_matrix = metrics.classification_report(production_data[target_column], prediction_labels, + metrics_matrix = metrics.classification_report(current_data[target_column], prediction_labels, output_dict=True) metrics_frame = pd.DataFrame(metrics_matrix) diff --git a/evidently/widgets/prob_class_prod_pr_curve_widget.py b/evidently/widgets/prob_class_prod_pr_curve_widget.py index 688fa315b8..a678ab2d43 100644 --- a/evidently/widgets/prob_class_prod_pr_curve_widget.py +++ b/evidently/widgets/prob_class_prod_pr_curve_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,9 +64,9 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #array_prediction = reference_data[prediction_column].to_numpy() @@ -74,11 +74,11 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #prediction_labels = 
[prediction_column[x] for x in prediction_ids] if len(prediction_column) <= 2: binaraizer = preprocessing.LabelBinarizer() - binaraizer.fit(production_data[target_column]) - binaraized_target = pd.DataFrame(binaraizer.transform(production_data[target_column])) + binaraizer.fit(current_data[target_column]) + binaraized_target = pd.DataFrame(binaraizer.transform(current_data[target_column])) binaraized_target.columns = ['target'] - p, r, thrs = metrics.precision_recall_curve(binaraized_target, production_data[prediction_column[0]]) #problem!!! + p, r, thrs = metrics.precision_recall_curve(binaraized_target, current_data[prediction_column[0]]) #problem!!! fig = go.Figure() fig.add_trace(go.Scatter( @@ -118,14 +118,14 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, else: binaraizer = preprocessing.LabelBinarizer() - binaraizer.fit(production_data[target_column]) - binaraized_target = pd.DataFrame(binaraizer.transform(production_data[target_column])) + binaraizer.fit(current_data[target_column]) + binaraized_target = pd.DataFrame(binaraizer.transform(current_data[target_column])) binaraized_target.columns = prediction_column #plot support bar graphs = [] for label in prediction_column: - p, r, thrs = metrics.precision_recall_curve(binaraized_target[label], production_data[label]) + p, r, thrs = metrics.precision_recall_curve(binaraized_target[label], current_data[label]) fig = go.Figure() fig.add_trace(go.Scatter( diff --git a/evidently/widgets/prob_class_prod_pr_table_widget.py b/evidently/widgets/prob_class_prod_pr_table_widget.py index 612ee4777c..3400f738ba 100644 --- a/evidently/widgets/prob_class_prod_pr_table_widget.py +++ b/evidently/widgets/prob_class_prod_pr_table_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def 
calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,9 +64,9 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #array_prediction = reference_data[prediction_column].to_numpy() @@ -75,14 +75,14 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, if len(prediction_column) <= 2: binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) - binaraized_target = pd.DataFrame(binaraizer.transform(production_data[target_column])) + binaraized_target = pd.DataFrame(binaraizer.transform(current_data[target_column])) binaraized_target.columns = ['target'] params_data = [] step_size = 0.05 binded = list(zip(binaraized_target['target'].tolist(), - production_data[prediction_column[0]].tolist())) + current_data[prediction_column[0]].tolist())) binded.sort(key = lambda item: item[1], reverse = True) data_size = len(binded) @@ -157,7 +157,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, else: binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) - binaraized_target = pd.DataFrame(binaraizer.transform(production_data[target_column])) + binaraized_target = pd.DataFrame(binaraizer.transform(current_data[target_column])) binaraized_target.columns = prediction_column #create tables @@ -168,7 +168,7 @@ def 
calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, step_size = 0.05 binded = list(zip(binaraized_target[label].tolist(), - production_data[label].tolist())) + current_data[label].tolist())) binded.sort(key = lambda item: item[1], reverse = True) data_size = len(binded) diff --git a/evidently/widgets/prob_class_prod_pred_distr_widget.py b/evidently/widgets/prob_class_prod_pred_distr_widget.py index 48e34c873c..0884c93d4d 100644 --- a/evidently/widgets/prob_class_prod_pred_distr_widget.py +++ b/evidently/widgets/prob_class_prod_pred_distr_widget.py @@ -33,7 +33,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzers_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzers_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -65,11 +65,11 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) - array_prediction = production_data[prediction_column].to_numpy() + array_prediction = current_data[prediction_column].to_numpy() prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [prediction_column[x] for x in prediction_ids] @@ -80,8 +80,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, for label in prediction_column: pred_distr = 
ff.create_distplot( [ - production_data[production_data[target_column] == label][label], - production_data[production_data[target_column] != label][label] + current_data[current_data[target_column] == label][label], + current_data[current_data[target_column] != label][label] ], [str(label), "other"], colors=[red, grey], diff --git a/evidently/widgets/prob_class_prod_prediction_cloud_widget.py b/evidently/widgets/prob_class_prod_prediction_cloud_widget.py index 0b905b5ce4..9763921b24 100644 --- a/evidently/widgets/prob_class_prod_prediction_cloud_widget.py +++ b/evidently/widgets/prob_class_prod_prediction_cloud_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,11 +64,11 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) - array_prediction = production_data[prediction_column].to_numpy() + array_prediction = current_data[prediction_column].to_numpy() prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [prediction_column[x] for x in prediction_ids] @@ -80,8 +80,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: 
pd.DataFrame, fig = go.Figure() fig.add_trace(go.Scatter( - x = np.random.random(production_data[production_data[target_column] == label].shape[0]), - y = production_data[production_data[target_column] == label][label], + x = np.random.random(current_data[current_data[target_column] == label].shape[0]), + y = current_data[current_data[target_column] == label][label], mode = 'markers', name = str(label), marker=dict( @@ -91,8 +91,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, )) fig.add_trace(go.Scatter( - x = np.random.random(production_data[production_data[target_column] != label].shape[0]), - y = production_data[production_data[target_column] != label][label], + x = np.random.random(current_data[current_data[target_column] != label].shape[0]), + y = current_data[current_data[target_column] != label][label], mode = 'markers', name = 'other', marker=dict( diff --git a/evidently/widgets/prob_class_prod_quality_metrics_widget.py b/evidently/widgets/prob_class_prod_quality_metrics_widget.py index 9d93b92d30..7df0098f1f 100644 --- a/evidently/widgets/prob_class_prod_quality_metrics_widget.py +++ b/evidently/widgets/prob_class_prod_quality_metrics_widget.py @@ -30,7 +30,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No reference data with target and prediction provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -59,16 +59,16 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) 
- if production_data is not None: + if current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) binaraizer = preprocessing.LabelBinarizer() binaraizer.fit(reference_data[target_column]) - binaraized_target = binaraizer.transform(production_data[target_column]) + binaraized_target = binaraizer.transform(current_data[target_column]) - array_prediction = production_data[prediction_column].to_numpy() + array_prediction = current_data[prediction_column].to_numpy() prediction_ids = np.argmax(array_prediction, axis=-1) prediction_labels = [prediction_column[x] for x in prediction_ids] @@ -78,15 +78,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, roc_auc = metrics.roc_auc_score(binaraized_target, array_prediction, average='macro') log_loss = metrics.log_loss(binaraized_target, array_prediction) else: - roc_auc = metrics.roc_auc_score(binaraized_target, production_data[prediction_column[0]]) #problem!!! - log_loss = metrics.log_loss(binaraized_target, production_data[prediction_column[0]]) #problem!!! + roc_auc = metrics.roc_auc_score(binaraized_target, current_data[prediction_column[0]]) #problem!!! + log_loss = metrics.log_loss(binaraized_target, current_data[prediction_column[0]]) #problem!!! 
- accuracy_score = metrics.accuracy_score(production_data[target_column], prediction_labels) - avg_precision = metrics.precision_score(production_data[target_column], prediction_labels, + accuracy_score = metrics.accuracy_score(current_data[target_column], prediction_labels) + avg_precision = metrics.precision_score(current_data[target_column], prediction_labels, average='macro') - avg_recall = metrics.recall_score(production_data[target_column], prediction_labels, + avg_recall = metrics.recall_score(current_data[target_column], prediction_labels, average='macro') - avg_f1 = metrics.f1_score(production_data[target_column], prediction_labels, + avg_f1 = metrics.f1_score(current_data[target_column], prediction_labels, average='macro') self.wi = BaseWidgetInfo( diff --git a/evidently/widgets/prob_class_prod_roc_curve_widget.py b/evidently/widgets/prob_class_prod_roc_curve_widget.py index 2907e8e4fa..6bba479592 100644 --- a/evidently/widgets/prob_class_prod_roc_curve_widget.py +++ b/evidently/widgets/prob_class_prod_roc_curve_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -64,17 +64,17 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #target_names = None - if production_data is not None and target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None and target_column is not None and prediction_column is not None: + current_data.replace([np.inf, -np.inf], 
np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) if len(prediction_column) <= 2: binaraizer = preprocessing.LabelBinarizer() - binaraizer.fit(production_data[target_column]) - binaraized_target = pd.DataFrame(binaraizer.transform(production_data[target_column])) + binaraizer.fit(current_data[target_column]) + binaraized_target = pd.DataFrame(binaraizer.transform(current_data[target_column])) binaraized_target.columns = ['target'] - fpr, tpr, thrs = metrics.roc_curve(binaraized_target, production_data[prediction_column[0]]) #problem!!! + fpr, tpr, thrs = metrics.roc_curve(binaraized_target, current_data[prediction_column[0]]) #problem!!! fig = go.Figure() fig.add_trace(go.Scatter( @@ -114,14 +114,14 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, else: binaraizer = preprocessing.LabelBinarizer() - binaraizer.fit(production_data[target_column]) - binaraized_target = pd.DataFrame(binaraizer.transform(production_data[target_column])) + binaraizer.fit(current_data[target_column]) + binaraized_target = pd.DataFrame(binaraizer.transform(current_data[target_column])) binaraized_target.columns = prediction_column #plot support bar graphs = [] for label in prediction_column: - fpr, tpr, thrs = metrics.roc_curve(binaraized_target[label], production_data[label]) + fpr, tpr, thrs = metrics.roc_curve(binaraized_target[label], current_data[label]) fig = go.Figure() fig.add_trace(go.Scatter( diff --git a/evidently/widgets/prob_class_ref_class_support_widget.py b/evidently/widgets/prob_class_ref_class_support_widget.py index d90a9b3a32..c4f41a1441 100644 --- a/evidently/widgets/prob_class_ref_class_support_widget.py +++ b/evidently/widgets/prob_class_ref_class_support_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, 
analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -103,7 +103,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "data": support_bar_json['data'], "layout": support_bar_json['layout'] diff --git a/evidently/widgets/prob_class_ref_conf_matrix_widget.py b/evidently/widgets/prob_class_ref_conf_matrix_widget.py index 228645a66e..a44e745a7f 100644 --- a/evidently/widgets/prob_class_ref_conf_matrix_widget.py +++ b/evidently/widgets/prob_class_ref_conf_matrix_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -101,7 +101,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "data": conf_matrix_json['data'], "layout": conf_matrix_json['layout'] diff --git a/evidently/widgets/prob_class_ref_metrics_matrix_widget.py b/evidently/widgets/prob_class_ref_metrics_matrix_widget.py index 6f424f5817..802b0fcd19 100644 --- a/evidently/widgets/prob_class_ref_metrics_matrix_widget.py +++ b/evidently/widgets/prob_class_ref_metrics_matrix_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise 
ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -112,7 +112,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "data": metrics_matrix_json['data'], "layout": metrics_matrix_json['layout'] diff --git a/evidently/widgets/prob_class_ref_pr_curve_widget.py b/evidently/widgets/prob_class_ref_pr_curve_widget.py index fcc7f002b0..bcfa2cee7a 100644 --- a/evidently/widgets/prob_class_ref_pr_curve_widget.py +++ b/evidently/widgets/prob_class_ref_pr_curve_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -108,7 +108,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "data": fig_json['data'], "layout": fig_json['layout'] @@ -163,7 +163,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ 
"graphs": graphs }, diff --git a/evidently/widgets/prob_class_ref_pr_table_widget.py b/evidently/widgets/prob_class_ref_pr_table_widget.py index cc81b74bb3..53c70f800b 100644 --- a/evidently/widgets/prob_class_ref_pr_table_widget.py +++ b/evidently/widgets/prob_class_ref_pr_table_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -116,7 +116,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "rowsPerPage" : 21, "columns": [ @@ -206,7 +206,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=2, #if production_data is not None else 2, + size=2, #if current_data is not None else 2, params={ "rowsPerPage": 21, "columns": [ @@ -249,7 +249,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, self.wi = BaseWidgetInfo( type="tabs", title=self.title, - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, details="", tabs=tabs ) diff --git a/evidently/widgets/prob_class_ref_pred_distr_widget.py b/evidently/widgets/prob_class_ref_pred_distr_widget.py index a768c5f4f3..6b4a3fe455 100644 --- a/evidently/widgets/prob_class_ref_pred_distr_widget.py +++ b/evidently/widgets/prob_class_ref_pred_distr_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data 
provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzers_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzers_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -121,7 +121,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "graphs": graphs }, diff --git a/evidently/widgets/prob_class_ref_prediction_cloud_widget.py b/evidently/widgets/prob_class_ref_prediction_cloud_widget.py index 11ed5414ba..5f8bfe609d 100644 --- a/evidently/widgets/prob_class_ref_prediction_cloud_widget.py +++ b/evidently/widgets/prob_class_ref_prediction_cloud_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -128,7 +128,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "graphs": graphs }, diff --git a/evidently/widgets/prob_class_ref_quality_metrics_widget.py b/evidently/widgets/prob_class_ref_quality_metrics_widget.py index 72a30fa1b0..2e91cbd455 100644 --- a/evidently/widgets/prob_class_ref_quality_metrics_widget.py +++ b/evidently/widgets/prob_class_ref_quality_metrics_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> 
BaseWidgetInfo: return self.wi raise ValueError("No reference data with target and prediction provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/prob_class_ref_roc_curve_widget.py b/evidently/widgets/prob_class_ref_roc_curve_widget.py index 8a3fc3a554..5b2eb965cf 100644 --- a/evidently/widgets/prob_class_ref_roc_curve_widget.py +++ b/evidently/widgets/prob_class_ref_roc_curve_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction or target data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -108,7 +108,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "data": fig_json['data'], "layout": fig_json['layout'] @@ -164,7 +164,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "graphs": graphs }, diff --git a/evidently/widgets/prob_class_target_name_widget.py b/evidently/widgets/prob_class_target_name_widget.py index 55ce2a54b8..43f73cdb76 100644 --- a/evidently/widgets/prob_class_target_name_widget.py +++ 
b/evidently/widgets/prob_class_target_name_widget.py @@ -33,7 +33,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data with target and prediction provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/raw_widget.py b/evidently/widgets/raw_widget.py index e6a715b1f1..b4ed49eeca 100644 --- a/evidently/widgets/raw_widget.py +++ b/evidently/widgets/raw_widget.py @@ -11,7 +11,7 @@ class RawWidget(Widget): @abc.abstractmethod - def calculate(self, reference_data: pandas.DataFrame, production_data: pandas.DataFrame, column_mapping, + def calculate(self, reference_data: pandas.DataFrame, current_data: pandas.DataFrame, column_mapping, analyzes_results): raise NotImplemented() diff --git a/evidently/widgets/reg_prod_abs_perc_error_in_time_widget.py b/evidently/widgets/reg_prod_abs_perc_error_in_time_widget.py index a30c3dc5f0..4d895808df 100644 --- a/evidently/widgets/reg_prod_abs_perc_error_in_time_widget.py +++ b/evidently/widgets/reg_prod_abs_perc_error_in_time_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -60,19 +60,19 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) 
cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: + if current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #plot output correlations abs_perc_error_time = go.Figure() abs_perc_error = list(map(lambda x : 100*abs(x[0] - x[1])/x[0], - zip(production_data[target_column], production_data[prediction_column]))) + zip(current_data[target_column], current_data[prediction_column]))) error_trace = go.Scatter( - x = production_data[date_column] if date_column else production_data.index, + x = current_data[date_column] if date_column else current_data.index, y = abs_perc_error, mode = 'lines', name = 'Absolute Percentage Error', @@ -83,8 +83,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, ) zero_trace = go.Scatter( - x = production_data[date_column] if date_column else production_data.index, - y = [0]*production_data.shape[0], + x = current_data[date_column] if date_column else current_data.index, + y = [0]*current_data.shape[0], mode = 'lines', opacity=0.5, marker=dict( diff --git a/evidently/widgets/reg_prod_colored_pred_vs_actual_widget.py b/evidently/widgets/reg_prod_colored_pred_vs_actual_widget.py index 2198bab258..62a403ea2e 100644 --- a/evidently/widgets/reg_prod_colored_pred_vs_actual_widget.py +++ b/evidently/widgets/reg_prod_colored_pred_vs_actual_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: 
pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -60,26 +60,26 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: + if current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) - prod_error = production_data[prediction_column] - production_data[target_column] + current_error = current_data[prediction_column] - current_data[target_column] - prod_quntile_5 = np.quantile(prod_error, .05) - prod_quntile_95 = np.quantile(prod_error, .95) + current_quntile_5 = np.quantile(current_error, .05) + current_quntile_95 = np.quantile(current_error, .95) - production_data['dataset'] = 'Current' - production_data['Error bias'] = list(map(lambda x : 'Underestimation' if x <= prod_quntile_5 else 'Majority' - if x < prod_quntile_95 else 'Overestimation', prod_error)) + current_data['dataset'] = 'Current' + current_data['Error bias'] = list(map(lambda x : 'Underestimation' if x <= current_quntile_5 else 'Majority' + if x < current_quntile_95 else 'Overestimation', current_error)) #plot output correlations pred_actual = go.Figure() pred_actual.add_trace(go.Scatter( - x = production_data[production_data['Error bias'] == 'Underestimation'][target_column], - y = production_data[production_data['Error bias'] == 'Underestimation'][prediction_column], + x = current_data[current_data['Error bias'] == 'Underestimation'][target_column], + y = 
current_data[current_data['Error bias'] == 'Underestimation'][prediction_column], mode = 'markers', name = 'Underestimation', marker = dict( @@ -89,8 +89,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, )) pred_actual.add_trace(go.Scatter( - x = production_data[production_data['Error bias'] == 'Overestimation'][target_column], - y = production_data[production_data['Error bias'] == 'Overestimation'][prediction_column], + x = current_data[current_data['Error bias'] == 'Overestimation'][target_column], + y = current_data[current_data['Error bias'] == 'Overestimation'][prediction_column], mode = 'markers', name = 'Overestimation', marker = dict( @@ -100,8 +100,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, )) pred_actual.add_trace(go.Scatter( - x = production_data[production_data['Error bias'] == 'Majority'][target_column], - y = production_data[production_data['Error bias'] == 'Majority'][prediction_column], + x = current_data[current_data['Error bias'] == 'Majority'][target_column], + y = current_data[current_data['Error bias'] == 'Majority'][prediction_column], mode = 'markers', name = 'Majority', marker = dict( diff --git a/evidently/widgets/reg_prod_error_distr_widget.py b/evidently/widgets/reg_prod_error_distr_widget.py index 6f2972ae65..3a1dfaec6a 100644 --- a/evidently/widgets/reg_prod_error_distr_widget.py +++ b/evidently/widgets/reg_prod_error_distr_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -60,15 +60,15 @@ def calculate(self, reference_data: pd.DataFrame, 
production_data: pd.DataFrame, num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: + if current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #plot output correlations error_distr = go.Figure() - error = production_data[prediction_column] - production_data[target_column] + error = current_data[prediction_column] - current_data[target_column] error_distr.add_trace(go.Histogram(x=error, marker_color=red, name = 'error distribution', histnorm = 'percent')) diff --git a/evidently/widgets/reg_prod_error_in_time_widget.py b/evidently/widgets/reg_prod_error_in_time_widget.py index 9b165b3436..c782d50b18 100644 --- a/evidently/widgets/reg_prod_error_in_time_widget.py +++ b/evidently/widgets/reg_prod_error_in_time_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -60,17 +60,17 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: + if 
current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #plot output correlations pred_actual_time = go.Figure() error_trace = go.Scatter( - x = production_data[date_column] if date_column else production_data.index, - y = production_data[prediction_column] - production_data[target_column], + x = current_data[date_column] if date_column else current_data.index, + y = current_data[prediction_column] - current_data[target_column], mode = 'lines', name = 'Predicted - Actual', marker=dict( @@ -80,8 +80,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, ) zero_trace = go.Scatter( - x = production_data[date_column] if date_column else production_data.index, - y = [0]*production_data.shape[0], + x = current_data[date_column] if date_column else current_data.index, + y = [0]*current_data.shape[0], mode = 'lines', opacity=0.5, marker=dict( diff --git a/evidently/widgets/reg_prod_error_normality_widget.py b/evidently/widgets/reg_prod_error_normality_widget.py index 8f69652bec..f9dd572f56 100644 --- a/evidently/widgets/reg_prod_error_normality_widget.py +++ b/evidently/widgets/reg_prod_error_normality_widget.py @@ -30,7 +30,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -59,15 +59,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, 
num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: + if current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #plot output correlations error_norm = go.Figure() - error = production_data[prediction_column] - production_data[target_column] + error = current_data[prediction_column] - current_data[target_column] qq_lines = probplot(error, dist="norm", plot=None) theoretical_q_x = np.linspace(qq_lines[0][0][0], qq_lines[0][0][-1], 100) diff --git a/evidently/widgets/reg_prod_pred_and_actual_in_time_widget.py b/evidently/widgets/reg_prod_pred_and_actual_in_time_widget.py index 0b2d9f1e79..4ef2e5c2ef 100644 --- a/evidently/widgets/reg_prod_pred_and_actual_in_time_widget.py +++ b/evidently/widgets/reg_prod_pred_and_actual_in_time_widget.py @@ -30,7 +30,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -59,17 +59,17 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is 
not None: + if current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #plot output correlations pred_actual_time = go.Figure() target_trace = go.Scatter( - x = production_data[date_column] if date_column else production_data.index, - y = production_data[target_column], + x = current_data[date_column] if date_column else current_data.index, + y = current_data[target_column], mode = 'lines', name = 'Actual', marker=dict( @@ -79,8 +79,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, ) pred_trace = go.Scatter( - x = production_data[date_column] if date_column else production_data.index, - y = production_data[prediction_column], + x = current_data[date_column] if date_column else current_data.index, + y = current_data[prediction_column], mode = 'lines', name = 'Predicted', marker=dict( @@ -90,8 +90,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, ) zero_trace = go.Scatter( - x = production_data[date_column] if date_column else production_data.index, - y = [0]*production_data.shape[0], + x = current_data[date_column] if date_column else current_data.index, + y = [0]*current_data.shape[0], mode = 'lines', opacity=0.5, marker=dict( diff --git a/evidently/widgets/reg_prod_pred_vs_actual_widget.py b/evidently/widgets/reg_prod_pred_vs_actual_widget.py index 329611ca6b..d1150773a2 100644 --- a/evidently/widgets/reg_prod_pred_vs_actual_widget.py +++ b/evidently/widgets/reg_prod_pred_vs_actual_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, 
analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -60,17 +60,17 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: + if current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #plot output correlations pred_actual = go.Figure() pred_actual.add_trace(go.Scatter( - x = production_data[target_column], - y = production_data[prediction_column], + x = current_data[target_column], + y = current_data[prediction_column], mode = 'markers', name = 'Current', marker = dict( diff --git a/evidently/widgets/reg_prod_quality_metrics_widget.py b/evidently/widgets/reg_prod_quality_metrics_widget.py index 20340c0942..87b1068f1a 100644 --- a/evidently/widgets/reg_prod_quality_metrics_widget.py +++ b/evidently/widgets/reg_prod_quality_metrics_widget.py @@ -33,7 +33,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi #raise ValueError("No reference data with target and prediction provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -62,20 +62,20 @@ def 
calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: + if current_data is not None: if target_column is not None and prediction_column is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) #calculate quality metrics - me = np.mean(production_data[prediction_column] - production_data[target_column]) - sde = np.std(production_data[prediction_column] - production_data[target_column], ddof = 1) + me = np.mean(current_data[prediction_column] - current_data[target_column]) + sde = np.std(current_data[prediction_column] - current_data[target_column], ddof = 1) - abs_err = np.abs(production_data[prediction_column] - production_data[target_column]) + abs_err = np.abs(current_data[prediction_column] - current_data[target_column]) mae = np.mean(abs_err) sdae = np.std(abs_err, ddof = 1) - abs_perc_err = 100.*np.abs(production_data[prediction_column] - production_data[target_column])/production_data[target_column] + abs_perc_err = 100.*np.abs(current_data[prediction_column] - current_data[target_column])/current_data[target_column] mape = np.mean(abs_perc_err) sdape = np.std(abs_perc_err, ddof = 1) diff --git a/evidently/widgets/reg_prod_underperform_metrics_widget.py b/evidently/widgets/reg_prod_underperform_metrics_widget.py index cc5f281571..a5eb5036bb 100644 --- a/evidently/widgets/reg_prod_underperform_metrics_widget.py +++ b/evidently/widgets/reg_prod_underperform_metrics_widget.py @@ -29,7 +29,7 @@ def analyzers(self): def get_info(self) -> BaseWidgetInfo: return self.wi - def calculate(self, reference_data: 
pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -59,24 +59,24 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) - prod_error = production_data[prediction_column] - production_data[target_column] + current_error = current_data[prediction_column] - current_data[target_column] - prod_quantile_5 = np.quantile(prod_error, .05) - prod_quantile_95 = np.quantile(prod_error, .95) + current_quantile_5 = np.quantile(current_error, .05) + current_quantile_95 = np.quantile(current_error, .95) - prod_mae = np.mean(prod_error) - prod_mae_under = np.mean(prod_error[prod_error <= prod_quantile_5]) - prod_mae_exp = np.mean(prod_error[(prod_error > prod_quantile_5) & (prod_error < prod_quantile_95)]) - prod_mae_over = np.mean(prod_error[prod_error >= prod_quantile_95]) + current_mae = np.mean(current_error) + current_mae_under = np.mean(current_error[current_error <= current_quantile_5]) + current_mae_exp = np.mean(current_error[(current_error > current_quantile_5) & (current_error < current_quantile_95)]) + current_mae_over = np.mean(current_error[current_error >= current_quantile_95]) - prod_sd = np.std(prod_error, ddof = 1) - prod_sd_under = np.std(prod_error[prod_error <= prod_quantile_5], ddof = 1) - prod_sd_exp = np.std(prod_error[(prod_error > prod_quantile_5) & (prod_error < 
prod_quantile_95)], ddof = 1) - prod_sd_over = np.std(prod_error[prod_error >= prod_quantile_95], ddof = 1) + current_sd = np.std(current_error, ddof = 1) + current_sd_under = np.std(current_error[current_error <= current_quantile_5], ddof = 1) + current_sd_exp = np.std(current_error[(current_error > current_quantile_5) & (current_error < current_quantile_95)], ddof = 1) + current_sd_over = np.std(current_error[current_error >= current_quantile_95], ddof = 1) self.wi = BaseWidgetInfo( title=self.title, @@ -90,19 +90,19 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, params={ "counters": [ { - "value": str(round(prod_mae_exp, 2)) + " (" + str(round(prod_sd_exp, 2)) + ")", + "value": str(round(current_mae_exp, 2)) + " (" + str(round(current_sd_exp, 2)) + ")", "label": "Majority(90%)" }, #{ - # "value": str(round(prod_mae_exp, 2)) + " (" + str(round(prod_sd_exp,2)) + ")", + # "value": str(round(current_mae_exp, 2)) + " (" + str(round(current_sd_exp,2)) + ")", # "label": "Expected" #}, { - "value": str(round(prod_mae_under, 2)) + " (" + str(round(prod_sd_under, 2)) + ")", + "value": str(round(current_mae_under, 2)) + " (" + str(round(current_sd_under, 2)) + ")", "label": "Underestimation(5%)" }, { - "value": str(round(prod_mae_over, 2)) + " (" + str(round(prod_sd_over, 2)) + ")", + "value": str(round(current_mae_over, 2)) + " (" + str(round(current_sd_over, 2)) + ")", "label": "Overestimation(5%)" } ] diff --git a/evidently/widgets/reg_ref_abs_perc_error_in_time_widget.py b/evidently/widgets/reg_ref_abs_perc_error_in_time_widget.py index 4c26a37f50..f919728a82 100644 --- a/evidently/widgets/reg_ref_abs_perc_error_in_time_widget.py +++ b/evidently/widgets/reg_ref_abs_perc_error_in_time_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def 
calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/reg_ref_colored_pred_vs_actual_widget.py b/evidently/widgets/reg_ref_colored_pred_vs_actual_widget.py index cc15869690..24dec56572 100644 --- a/evidently/widgets/reg_ref_colored_pred_vs_actual_widget.py +++ b/evidently/widgets/reg_ref_colored_pred_vs_actual_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -130,7 +130,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, alerts=[], alertsPosition="row", insights=[], - size=1 if production_data is not None else 2, + size=1 if current_data is not None else 2, params={ "data": pred_actual_json['data'], "layout": pred_actual_json['layout'] diff --git a/evidently/widgets/reg_ref_error_distr_widget.py b/evidently/widgets/reg_ref_error_distr_widget.py index c04ce5469e..f75d54a415 100644 --- a/evidently/widgets/reg_ref_error_distr_widget.py +++ b/evidently/widgets/reg_ref_error_distr_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git 
a/evidently/widgets/reg_ref_error_in_time_widget.py b/evidently/widgets/reg_ref_error_in_time_widget.py index 511d5ccfbc..5fde1660cb 100644 --- a/evidently/widgets/reg_ref_error_in_time_widget.py +++ b/evidently/widgets/reg_ref_error_in_time_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/reg_ref_error_normality_widget.py b/evidently/widgets/reg_ref_error_normality_widget.py index b76f319162..d3da3ada90 100644 --- a/evidently/widgets/reg_ref_error_normality_widget.py +++ b/evidently/widgets/reg_ref_error_normality_widget.py @@ -30,7 +30,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No prediction data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/reg_ref_pred_and_actual_in_time_widget.py b/evidently/widgets/reg_ref_pred_and_actual_in_time_widget.py index cf60dc6289..979ab45d6e 100644 --- a/evidently/widgets/reg_ref_pred_and_actual_in_time_widget.py +++ b/evidently/widgets/reg_ref_pred_and_actual_in_time_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, 
reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/reg_ref_pred_vs_actual_widget.py b/evidently/widgets/reg_ref_pred_vs_actual_widget.py index 26606e9736..be1fa0c71b 100644 --- a/evidently/widgets/reg_ref_pred_vs_actual_widget.py +++ b/evidently/widgets/reg_ref_pred_vs_actual_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/reg_ref_quality_metrics_widget.py b/evidently/widgets/reg_ref_quality_metrics_widget.py index 139f49428d..e8e9de7fdd 100644 --- a/evidently/widgets/reg_ref_quality_metrics_widget.py +++ b/evidently/widgets/reg_ref_quality_metrics_widget.py @@ -33,7 +33,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data with target and prediction provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/reg_ref_underperform_metrics_widget.py b/evidently/widgets/reg_ref_underperform_metrics_widget.py index f0bcbfc221..4e0a2ebee2 100644 --- a/evidently/widgets/reg_ref_underperform_metrics_widget.py +++ b/evidently/widgets/reg_ref_underperform_metrics_widget.py @@ -31,7 +31,7 @@ def get_info(self) -> 
BaseWidgetInfo: return self.wi raise ValueError("no widget info provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/reg_target_name_widget.py b/evidently/widgets/reg_target_name_widget.py index e781bfce0a..cd1708b4ef 100644 --- a/evidently/widgets/reg_target_name_widget.py +++ b/evidently/widgets/reg_target_name_widget.py @@ -33,7 +33,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("No reference data with target and prediction provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') diff --git a/evidently/widgets/reg_underperform_segments_table_widget.py b/evidently/widgets/reg_underperform_segments_table_widget.py index 6eec0ccc97..b0fd58a143 100644 --- a/evidently/widgets/reg_underperform_segments_table_widget.py +++ b/evidently/widgets/reg_underperform_segments_table_widget.py @@ -32,7 +32,7 @@ def get_info(self) -> BaseWidgetInfo: return self.wi raise ValueError("no widget info provided") - def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping, analyzes_results): + def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping, analyzes_results): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') @@ -62,34 +62,34 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, cat_feature_names = 
list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) - if production_data is not None: - production_data.replace([np.inf, -np.inf], np.nan, inplace=True) - production_data.dropna(axis=0, how='any', inplace=True) + if current_data is not None: + current_data.replace([np.inf, -np.inf], np.nan, inplace=True) + current_data.dropna(axis=0, how='any', inplace=True) reference_data.replace([np.inf, -np.inf], np.nan, inplace=True) reference_data.dropna(axis=0, how='any', inplace=True) ref_error = reference_data[prediction_column] - reference_data[target_column] - prod_error = production_data[prediction_column] - production_data[target_column] + current_error = current_data[prediction_column] - current_data[target_column] ref_quntile_5 = np.quantile(ref_error, .05) ref_quntile_95 = np.quantile(ref_error, .95) - prod_quntile_5 = np.quantile(prod_error, .05) - prod_quntile_95 = np.quantile(prod_error, .95) + current_quntile_5 = np.quantile(current_error, .05) + current_quntile_95 = np.quantile(current_error, .95) #create subplots reference_data['dataset'] = 'Reference' reference_data['Error bias'] = list(map(lambda x : 'Underestimation' if x <= ref_quntile_5 else 'Majority' if x < ref_quntile_95 else 'Overestimation', ref_error)) - production_data['dataset'] = 'Current' - production_data['Error bias'] = list(map(lambda x : 'Underestimation' if x <= prod_quntile_5 else 'Majority' - if x < prod_quntile_95 else 'Overestimation', prod_error)) - merged_data = pd.concat([reference_data, production_data]) + current_data['dataset'] = 'Current' + current_data['Error bias'] = list(map(lambda x : 'Underestimation' if x <= current_quntile_5 else 'Majority' + if x < current_quntile_95 else 'Overestimation', current_error)) + merged_data = pd.concat([reference_data, current_data]) reference_data.drop(['dataset', 'Error bias'], axis=1, inplace=True) - production_data.drop(['dataset', 'Error bias'], axis=1, inplace=True) + current_data.drop(['dataset', 'Error 
bias'], axis=1, inplace=True) params_data = [] additional_graphs_data = [] @@ -103,11 +103,11 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, ref_over_value = np.mean(reference_data[ref_error >= ref_quntile_95][feature_name]) ref_range_value = 0 if ref_over_value == ref_under_value else 100*abs(ref_over_value - ref_under_value)/(np.max(reference_data[feature_name]) - np.min(reference_data[feature_name])) - prod_overal_value = np.mean(production_data[feature_name]) - prod_under_value = np.mean(production_data[prod_error <= prod_quntile_5][feature_name]) - prod_expected_value = np.mean(production_data[(prod_error > prod_quntile_5) & (prod_error < prod_quntile_95)][feature_name]) - prod_over_value = np.mean(production_data[prod_error >= prod_quntile_95][feature_name]) - prod_range_value = 0 if prod_over_value == prod_under_value else 100*abs(prod_over_value - prod_under_value)/(np.max(production_data[feature_name]) - np.min(production_data[feature_name])) + current_overal_value = np.mean(current_data[feature_name]) + current_under_value = np.mean(current_data[current_error <= current_quntile_5][feature_name]) + current_expected_value = np.mean(current_data[(current_error > current_quntile_5) & (current_error < current_quntile_95)][feature_name]) + current_over_value = np.mean(current_data[current_error >= current_quntile_95][feature_name]) + current_range_value = 0 if current_over_value == current_under_value else 100*abs(current_over_value - current_under_value)/(np.max(current_data[feature_name]) - np.min(current_data[feature_name])) feature_hist = px.histogram(merged_data, x=feature_name, color='Error bias', facet_col="dataset", @@ -124,8 +124,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, mode = 'markers', marker=dict( size=6, - cmax=max(max(reference_data[feature_name]), max(production_data[feature_name])), - cmin=min(min(reference_data[feature_name]), min(production_data[feature_name])), 
+ cmax=max(max(reference_data[feature_name]), max(current_data[feature_name])), + cmin=min(min(reference_data[feature_name]), min(current_data[feature_name])), color=reference_data[feature_name], #colorbar=dict( # title="Colorbar" @@ -138,15 +138,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, segment_fig.add_trace( go.Scatter( - x = production_data[target_column], - y = production_data[prediction_column], + x = current_data[target_column], + y = current_data[prediction_column], mode = 'markers', #name = feature_name + ' (curr)', marker=dict( size=6, - cmax=max(max(reference_data[feature_name]), max(production_data[feature_name])), - cmin=min(min(reference_data[feature_name]), min(production_data[feature_name])), - color=production_data[feature_name], + cmax=max(max(reference_data[feature_name]), max(current_data[feature_name])), + cmin=min(min(reference_data[feature_name]), min(current_data[feature_name])), + color=current_data[feature_name], colorbar=dict( title=feature_name ), @@ -190,10 +190,10 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, "f4": round(ref_under_value, 2), "f5": round(ref_over_value, 2), "f6": round(ref_range_value, 2), - "f7": round(prod_expected_value, 2), - "f8": round(prod_under_value, 2), - "f9": round(prod_over_value, 2), - "f10": round(prod_range_value, 2) + "f7": round(current_expected_value, 2), + "f8": round(current_under_value, 2), + "f9": round(current_over_value, 2), + "f10": round(current_range_value, 2) } ) @@ -227,12 +227,12 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, ref_range_value = 1 if (ref_overal_value != ref_under_value) or (ref_over_value != ref_overal_value) \ or (ref_under_value != ref_overal_value) else 0 - prod_overal_value = production_data[feature_name].value_counts().idxmax() - prod_under_value = production_data[prod_error <= prod_quntile_5][feature_name].value_counts().idxmax() - 
#prod_expected_value = production_data[(prod_error > prod_quntile_5) & (prod_error < prod_quntile_95)][feature_name].value_counts().idxmax() - prod_over_value = production_data[prod_error >= prod_quntile_95][feature_name].value_counts().idxmax() - prod_range_value = 1 if (prod_overal_value != prod_under_value) or (prod_over_value != prod_overal_value) \ - or (prod_under_value != prod_overal_value) else 0 + current_overal_value = current_data[feature_name].value_counts().idxmax() + current_under_value = current_data[current_error <= current_quntile_5][feature_name].value_counts().idxmax() + #current_expected_value = current_data[(current_error > current_quntile_5) & (current_error < current_quntile_95)][feature_name].value_counts().idxmax() + current_over_value = current_data[current_error >= current_quntile_95][feature_name].value_counts().idxmax() + current_range_value = 1 if (current_overal_value != current_under_value) or (current_over_value != current_overal_value) \ + or (current_under_value != current_overal_value) else 0 feature_hist = px.histogram(merged_data, x=feature_name, color='Error bias', facet_col="dataset", histnorm = 'percent', barmode='overlay', category_orders={"dataset": ["Reference", "Current"], "Error bias": ["Underestimation", "Overestimation", "Majority"]}) @@ -250,8 +250,8 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, #marker_color = reference_data[feature_name], marker=dict( size=6, - cmax=max(max(reference_data[feature_name]), max(production_data[feature_name])), - cmin=min(min(reference_data[feature_name]), min(production_data[feature_name])), + cmax=max(max(reference_data[feature_name]), max(current_data[feature_name])), + cmin=min(min(reference_data[feature_name]), min(current_data[feature_name])), color=reference_data[feature_name], #colorbar=dict( # title="Colorbar" @@ -264,16 +264,16 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, segment_fig.add_trace( 
go.Scatter( - x = production_data[target_column], - y = production_data[prediction_column], + x = current_data[target_column], + y = current_data[prediction_column], mode = 'markers', #name = feature_name + ' (curr)', - #marker_color = production_data[feature_name], + #marker_color = current_data[feature_name], marker=dict( size=6, - cmax=max(max(reference_data[feature_name]), max(production_data[feature_name])), - cmin=min(min(reference_data[feature_name]), min(production_data[feature_name])), - color=production_data[feature_name], + cmax=max(max(reference_data[feature_name]), max(current_data[feature_name])), + cmin=min(min(reference_data[feature_name]), min(current_data[feature_name])), + color=current_data[feature_name], colorbar=dict( title=feature_name ), @@ -315,10 +315,10 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, "f4": str(ref_under_value), "f5": str(ref_over_value), "f6": str(ref_range_value), - "f7": str(prod_overal_value), - "f8": str(prod_under_value), - "f9": str(prod_over_value), - "f10": int(prod_range_value) + "f7": str(current_overal_value), + "f8": str(current_under_value), + "f9": str(current_over_value), + "f10": int(current_range_value) } ) @@ -431,7 +431,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, category_orders={"Error bias": ["Underestimation", "Overestimation", "Majority"]}) #hist_fig = px.histogram(reference_data, x=feature_name, color=target_column, facet_col="dataset", - # category_orders={"dataset": ["Reference", "Production"]}) + # category_orders={"dataset": ["Reference", "Сurrent"]}) hist_figure = json.loads(hist.to_json()) @@ -498,7 +498,7 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, barmode='overlay', category_orders={"Error bias": ["Underestimation", "Overestimation", "Majority"]}) #hist_fig = px.histogram(reference_data, x=feature_name, color=target_column, facet_col="dataset", - # 
category_orders={"dataset": ["Reference", "Production"]}) + # category_orders={"dataset": ["Reference", "Current"]}) hist_figure = json.loads(hist.to_json()) diff --git a/evidently/widgets/widget.py b/evidently/widgets/widget.py index 093b2c0ff6..34c5b35cf5 100644 --- a/evidently/widgets/widget.py +++ b/evidently/widgets/widget.py @@ -14,7 +14,7 @@ def __init__(self): @abc.abstractmethod def calculate(self, reference_data: pandas.DataFrame, - production_data: pandas.DataFrame, column_mapping, analyzers_results): + current_data: pandas.DataFrame, column_mapping, analyzers_results): raise NotImplemented() @abc.abstractmethod diff --git a/setup.py b/setup.py index 58513888b5..6747b7348d 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ "statsmodels", "plotly", "scipy", + "pyyaml", "scikit-learn>=0.22.1" ], entry_points={