From cefdb94dd5e8f26bb3c565ecb8e67cdbd9a7467b Mon Sep 17 00:00:00 2001 From: Chahrazad650 Date: Tue, 3 Mar 2026 04:18:30 +0100 Subject: [PATCH] ajout : ajout des tests test_cleaning.py --- cleaning.py | 23 ++++++++++++++--- main.py | 26 +++++++------------- test_cleaning.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 21 deletions(-) create mode 100644 test_cleaning.py diff --git a/cleaning.py b/cleaning.py index efc1054..fcaabdb 100644 --- a/cleaning.py +++ b/cleaning.py @@ -5,12 +5,27 @@ import pandas as pd SCORE_COLS = ["Robert", "Robinson", "Suckling"] -def display_info(df: DataFrame) -> None: - df.describe() - print(df.info()) - print("\nNombre de valeurs manquantes par colonne :") +def display_info(df: DataFrame, name: str = "DataFrame") -> None: + """ + Affiche un résumé du DataFrame + -la taille + -types des colonnes + -valeurs manquantes + -statistiques numériques + """ + print(f"\n===== {name} =====") + + print(f"Shape : {df.shape[0]} lignes × {df.shape[1]} colonnes") + + print("\nTypes des colonnes :") + print(df.dtypes) + + print("\nValeurs manquantes :") print(df.isna().sum()) + print("\nStatistiques numériques :") + print(df.describe().round(2)) + def drop_empty_appellation(df: DataFrame) -> DataFrame: diff --git a/main.py b/main.py index b59e373..65cbd62 100755 --- a/main.py +++ b/main.py @@ -29,41 +29,33 @@ def main() -> None: df = load_csv(argv[1]) - print("=== Avant nettoyage ===") - display_info(df) + display_info(df, "Avant le nettoyage") df = drop_empty_appellation(df) save_csv(df, "donnee_clean.csv") - - print("\n=== Après nettoyage d'appellations manquantes ===") - display_info(df) + display_info(df, "Après nettoyage d'appellations manquantes") #la moyenne des notes des vins pour chaque appellation robert_means = mean_robert(df) save_csv(robert_means, "mean_robert_by_appellation.csv") - print("\n=== moyenne Robert par appellation ===") - print(robert_means.head(10)) + display_info(robert_means, 
"Moyennes Robert par appellation") robinson_means = mean_robinson(df) save_csv(robinson_means, "mean_robinson_by_appellation.csv") - print("\n=== moyennes Robinson par appellation ===") - print(robinson_means.head(10)) - + display_info(robinson_means, "Moyennes Robinson par appellation") + suckling_means = mean_suckling(df) save_csv(suckling_means, "mean_suckling_by_appellation.csv") - print("\n=== moyennes Suckling par appellation ===") - print(suckling_means.head(10)) + display_info(suckling_means, "Moyennes Suckling par appellation") df_missing_scores = fill_missing_scores(df) save_csv(df_missing_scores, "donnee_filled.csv") - print("\n=== Après remplissage des notes manquantes ===") - display_info(df_missing_scores) + display_info(df_missing_scores, "Après remplissage des notes manquantes par la moyenne de l'appellation") df_ready = encode_appellation(df_missing_scores) save_csv(df_ready, "donnee_ready.csv") - print("\n=== Après remplacer la colonne 'Appellation' par des colonnes indicatrices ===") - display_info(df_ready) - print(df_ready.filter(like="App_").any().head()) + display_info(df_ready, "Après remplacer la colonne 'Appellation' par des colonnes indicatrices") + if __name__ == "__main__": try: diff --git a/test_cleaning.py b/test_cleaning.py new file mode 100644 index 0000000..d376ccf --- /dev/null +++ b/test_cleaning.py @@ -0,0 +1,64 @@ +import pandas as pd +import pytest +from pandas import DataFrame + +from cleaning import ( + SCORE_COLS, + drop_empty_appellation, + mean_score, + fill_missing_scores, + encode_appellation, +) + + +@pytest.fixture +def df_raw() -> DataFrame: + return pd.DataFrame({ + "Appellation": ["Pauillac", "Pauillac ", "Margaux", None, "Pomerol", "Pomerol"], + "Robert": ["95", None, "bad", 90, None, None], + "Robinson": [None, "93", 18, None, None, None], + "Suckling": [96, None, None, None, 91, None], + "Prix": ["10.0", "11.0", "20.0", "30.0", "40.0", "50.0"], + }) + + +def test_drop_empty_appellation(df_raw: DataFrame): + out 
= drop_empty_appellation(df_raw) + assert out["Appellation"].isna().sum() == 0 + assert len(out) == 5 + + +def test_mean_score_zero_when_no_scores(df_raw: DataFrame): + out = drop_empty_appellation(df_raw) + m = mean_score(out, "Robert") + assert list(m.columns) == ["Appellation", "mean_Robert"] + + # Pomerol n'a aucune note Robert => moyenne doit être 0 + pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[0] + assert pomerol_mean == 0 + + +def test_fill_missing_scores(df_raw: DataFrame): + out = drop_empty_appellation(df_raw) + filled = fill_missing_scores(out) + + # plus de NaN dans les colonnes de scores + for col in SCORE_COLS: + assert filled[col].isna().sum() == 0 + + assert filled.loc[1, "Robert"] == 95.0 + + # pas de colonnes temporaires mean_* + for col in SCORE_COLS: + assert f"mean_{col}" not in filled.columns + + +def test_encode_appellation(df_raw: DataFrame): + out = drop_empty_appellation(df_raw) + filled = fill_missing_scores(out) + encoded = encode_appellation(filled) + + # la colonne texte disparaît + assert "Appellation" not in encoded.columns + assert "Pauillac" in encoded.columns + assert encoded.loc[0, "Pauillac"] == 1 \ No newline at end of file