diff --git a/src/fklearn/causal/validation/auc.py b/src/fklearn/causal/validation/auc.py index 15ca81b3..adefade3 100644 --- a/src/fklearn/causal/validation/auc.py +++ b/src/fklearn/causal/validation/auc.py @@ -13,6 +13,7 @@ def area_under_the_cumulative_effect_curve(df: pd.DataFrame, prediction: str, min_rows: int = 30, steps: int = 100, + ascending: bool = False, effect_fn: EffectFnType = linear_effect) -> float: """ Orders the dataset by prediction and computes the area under the cumulative effect curve, according to that @@ -38,6 +39,9 @@ def area_under_the_cumulative_effect_curve(df: pd.DataFrame, steps : Integer The number of cumulative steps to iterate when accumulating the effect + ascending : Boolean + Indicates if the dataset should be ordered ascending with respect to the prediction column + effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int A function that computes the treatment effect given a dataframe, the name of the treatment column and the name of the outcome column. @@ -55,7 +59,7 @@ def area_under_the_cumulative_effect_curve(df: pd.DataFrame, step_sizes = [min_rows] + [t - s for s, t in zip(n_rows, n_rows[1:])] cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction, - min_rows=min_rows, steps=steps, effect_fn=effect_fn) + min_rows=min_rows, steps=steps, ascending=ascending, effect_fn=effect_fn) return abs(sum([(effect - ate) * (step_size / size) for effect, step_size in zip(cum_effect, step_sizes)])) @@ -67,6 +71,7 @@ def area_under_the_cumulative_gain_curve(df: pd.DataFrame, prediction: str, min_rows: int = 30, steps: int = 100, + ascending: bool = False, effect_fn: EffectFnType = linear_effect) -> float: """ Orders the dataset by prediction and computes the area under the cumulative gain curve, according to that ordering. @@ -91,6 +96,9 @@ def area_under_the_cumulative_gain_curve(df: pd.DataFrame, steps : Integer The number of cumulative steps to iterate when accumulating the effect + ascending : Boolean + Indicates if the dataset should be ordered ascending with respect to the prediction column + effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int A function that computes the treatment effect given a dataframe, the name of the treatment column and the name of the outcome column. @@ -107,7 +115,7 @@ def area_under_the_cumulative_gain_curve(df: pd.DataFrame, step_sizes = [min_rows] + [t - s for s, t in zip(n_rows, n_rows[1:])] cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction, - min_rows=min_rows, steps=steps, effect_fn=effect_fn) + min_rows=min_rows, steps=steps, ascending=ascending, effect_fn=effect_fn) return abs(sum([effect * (rows / size) * (step_size / size) for rows, effect, step_size in zip(n_rows, cum_effect, step_sizes)])) @@ -120,6 +128,7 @@ def area_under_the_relative_cumulative_gain_curve(df: pd.DataFrame, prediction: str, min_rows: int = 30, steps: int = 100, + ascending: bool = False, effect_fn: EffectFnType = linear_effect) -> float: """ Orders the dataset by prediction and computes the area under the relative cumulative gain curve, according to that @@ -145,6 +154,9 @@ def area_under_the_relative_cumulative_gain_curve(df: pd.DataFrame, steps : Integer The number of cumulative steps to iterate when accumulating the effect + ascending : Boolean + Indicates if the dataset should be ordered ascending with respect to the prediction column + effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int A function that computes the treatment effect given a dataframe, the name of the treatment column and the name of the outcome column. @@ -162,7 +174,7 @@ def area_under_the_relative_cumulative_gain_curve(df: pd.DataFrame, step_sizes = [min_rows] + [t - s for s, t in zip(n_rows, n_rows[1:])] cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction, - min_rows=min_rows, steps=steps, effect_fn=effect_fn) + min_rows=min_rows, steps=steps, ascending=ascending, effect_fn=effect_fn) return abs(sum([(effect - ate) * (rows / size) * (step_size / size) for rows, effect, step_size in zip(n_rows, cum_effect, step_sizes)])) diff --git a/src/fklearn/causal/validation/curves.py b/src/fklearn/causal/validation/curves.py index f3852479..8f9b53c4 100644 --- a/src/fklearn/causal/validation/curves.py +++ b/src/fklearn/causal/validation/curves.py @@ -59,6 +59,7 @@ def cumulative_effect_curve(df: pd.DataFrame, prediction: str, min_rows: int = 30, steps: int = 100, + ascending: bool = False, effect_fn: EffectFnType = linear_effect) -> np.ndarray: """ Orders the dataset by prediction and computes the cumulative effect curve according to that ordering @@ -83,6 +84,9 @@ def cumulative_effect_curve(df: pd.DataFrame, steps : Integer The number of cumulative steps to iterate when accumulating the effect + ascending : Boolean + Indicates if the dataset should be ordered ascending with respect to the prediction column + effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int A function that computes the treatment effect given a dataframe, the name of the treatment column and the name of the outcome column. @@ -95,7 +99,7 @@ def cumulative_effect_curve(df: pd.DataFrame, """ size = df.shape[0] - ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True) + ordered_df = df.sort_values(prediction, ascending=ascending).reset_index(drop=True) n_rows = list(range(min_rows, size, size // steps)) + [size] return np.array([effect_fn(ordered_df.head(rows), treatment, outcome) for rows in n_rows]) @@ -107,6 +111,7 @@ def cumulative_gain_curve(df: pd.DataFrame, prediction: str, min_rows: int = 30, steps: int = 100, + ascending: bool = False, effect_fn: EffectFnType = linear_effect) -> np.ndarray: """ Orders the dataset by prediction and computes the cumulative gain (effect * proportional sample size) curve @@ -132,6 +137,9 @@ def cumulative_gain_curve(df: pd.DataFrame, steps : Integer The number of cumulative steps to iterate when accumulating the effect + ascending : Boolean + Indicates if the dataset should be ordered ascending with respect to the prediction column + effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int A function that computes the treatment effect given a dataframe, the name of the treatment column and the name of the outcome column. @@ -147,7 +155,7 @@ def cumulative_gain_curve(df: pd.DataFrame, n_rows = list(range(min_rows, size, size // steps)) + [size] cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction, - min_rows=min_rows, steps=steps, effect_fn=effect_fn) + min_rows=min_rows, steps=steps, ascending=ascending, effect_fn=effect_fn) return np.array([effect * (rows / size) for rows, effect in zip(n_rows, cum_effect)]) @@ -159,6 +167,7 @@ def relative_cumulative_gain_curve(df: pd.DataFrame, prediction: str, min_rows: int = 30, steps: int = 100, + ascending: bool = False, effect_fn: EffectFnType = linear_effect) -> np.ndarray: """ Orders the dataset by prediction and computes the relative cumulative gain curve curve according to that ordering. @@ -185,6 +194,9 @@ def relative_cumulative_gain_curve(df: pd.DataFrame, steps : Integer The number of cumulative steps to iterate when accumulating the effect + ascending : Boolean + Indicates if the dataset should be ordered ascending with respect to the prediction column + effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int A function that computes the treatment effect given a dataframe, the name of the treatment column and the name of the outcome column. @@ -201,7 +213,7 @@ def relative_cumulative_gain_curve(df: pd.DataFrame, n_rows = list(range(min_rows, size, size // steps)) + [size] cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction, - min_rows=min_rows, steps=steps, effect_fn=effect_fn) + min_rows=min_rows, steps=steps, ascending=ascending, effect_fn=effect_fn) return np.array([(effect - ate) * (rows / size) for rows, effect in zip(n_rows, cum_effect)]) @@ -214,6 +226,7 @@ def effect_curves( prediction: str, min_rows: int = 30, steps: int = 100, + ascending: bool = False, effect_fn: EffectFnType = linear_effect, ) -> pd.DataFrame: """ @@ -243,6 +256,9 @@ def effect_curves( steps : Integer The number of cumulative steps to iterate when accumulating the effect + ascending : Boolean + Indicates if the dataset should be ordered ascending with respect to the prediction column + effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int A function that computes the treatment effect given a dataframe, the name of the treatment column and the name of the outcome column. @@ -264,6 +280,7 @@ def effect_curves( prediction=prediction, min_rows=min_rows, steps=steps, + ascending=ascending, effect_fn=effect_fn, ) ate: float = cum_effect[-1] diff --git a/tests/causal/validation/test_curves.py b/tests/causal/validation/test_curves.py index 98e6cc74..cccc55f2 100644 --- a/tests/causal/validation/test_curves.py +++ b/tests/causal/validation/test_curves.py @@ -28,12 +28,16 @@ def test_cumulative_effect_curve(): y=[1, 1, 1, 2, 3, 4, 3, 5, 7], )) - expected = np.array([3., 3., 2.92857143, 2.5, 2.5, 2.46153846, 2.]) + asc_expected = np.array([1., 1., 1.07142857, 1.5, 1.5, 1.53846154, 2.]) + desc_expected = np.array([3., 3., 2.92857143, 2.5, 2.5, 2.46153846, 2.]) - result = cumulative_effect_curve(df, prediction="x", outcome="y", treatment="t", min_rows=3, steps=df.shape[0], - effect_fn=linear_effect) + asc_result = cumulative_effect_curve(df, prediction="x", outcome="y", treatment="t", min_rows=3, steps=df.shape[0], + ascending=True, effect_fn=linear_effect) + desc_result = cumulative_effect_curve(df, prediction="x", outcome="y", treatment="t", min_rows=3, steps=df.shape[0], + effect_fn=linear_effect) - np.testing.assert_allclose(expected, result, rtol=1e-07) + np.testing.assert_allclose(asc_expected, asc_result, rtol=1e-07) + np.testing.assert_allclose(desc_expected, desc_result, rtol=1e-07) def test_cumulative_gain_curve():