ajout: restructuration de la cleaning

2026-03-28 11:03:41 +00:00 · 2026-03-06 17:56:07 +01:00
parent 69b8b4ce1f
commit acf4ddd881
2 changed files with 121 additions and 133 deletions
--- a/src/cleaning.py
+++ b/src/cleaning.py
@@ -1,103 +1,87 @@
 #!/usr/bin/env python3
-from pandas import DataFrame, to_numeric, get_dummies

-SCORE_COLS = ["Robert", "Robinson", "Suckling"]
+from typing import cast, override
+from pandas import DataFrame, read_csv, to_numeric, get_dummies


-def display_info(df: DataFrame, name: str = "DataFrame") -> None:
-    """
-    Affiche un résumé du DataFrame
-        -la taille
-        -types des colonnes
-        -valeurs manquantes
-        -statistiques numériques
-    """
-    print(f"\n===== {name} =====")
+class Cleaning:
+    def __init__(self, filename) -> None:
+        self._vins: DataFrame = read_csv(filename)
+        # Score columns: every CSV column except "Appellation" and "Prix".
+        self.SCORE_COLS: list[str] = [
+            c for c in self._vins.columns if c not in ["Appellation", "Prix"]
+        ]
+        # Convert score columns to numeric; errors="coerce" maps unparseable values to NaN.
+        for col in self.SCORE_COLS:
+            self._vins[col] = to_numeric(self._vins[col], errors="coerce")

-    print(f"Shape : {df.shape[0]} lignes × {df.shape[1]} colonnes")
+    def getVins(self) -> DataFrame:
+        return self._vins.copy(deep=True)  # deep copy: callers cannot mutate self._vins

-    print("\nTypes des colonnes :")
-    print(df.dtypes)
+    @override
+    def __str__(self) -> str:
+        """
+        Return (not print) a text summary of the DataFrame:
+            - its size (rows x columns)
+            - the dtype of each column
+            - the number of missing values per column
+            - descriptive statistics, rounded to 2 decimals
+        """
+        return (
+            f"Shape : {self._vins.shape[0]} lignes x {self._vins.shape[1]} colonnes\n\n"
+            f"Types des colonnes :\n{self._vins.dtypes}\n\n"
+            f"Valeurs manquantes :\n{self._vins.isna().sum()}\n\n"
+            f"Statistiques numériques :\n{self._vins.describe().round(2)}\n\n"
+        )

-    print("\nValeurs manquantes :")
-    print(df.isna().sum())
+    def drop_empty_appellation(self) -> "Cleaning":  # forward ref: class name not bound yet
+        self._vins = self._vins.dropna(subset=["Appellation"])  # drop rows with no Appellation
+        return self  # returned so calls can be chained

-    print("\nStatistiques numériques :")
-    print(df.describe().round(2))
+    def _mean_score(self, col: str) -> DataFrame:
+        """
+        Compute the per-appellation mean of one score column.
+            - Groups the rows by "Appellation" and averages `col`
+            - Renames the averaged column to "mean_<col>"
+            - Replaces NaN values in the result with 0

+        """
+        means = self._vins.groupby("Appellation", as_index=False)[col].mean()
+        means = means.rename(
+            columns={col: f"mean_{col}"}
+        )  # pyright: ignore[reportCallIssue]
+        return cast(DataFrame, means.fillna(0))

-def drop_empty_appellation(df: DataFrame) -> DataFrame:
+    def _mean_robert(self) -> DataFrame:
+        return self._mean_score("Robert")  # per-appellation mean of the "Robert" column

-    return df.dropna(subset=["Appellation"])
+    def _mean_robinson(self) -> DataFrame:
+        return self._mean_score("Robinson")  # per-appellation mean of the "Robinson" column

+    def _mean_suckling(self) -> DataFrame:
+        return self._mean_score("Suckling")  # per-appellation mean of the "Suckling" column

-def mean_score(df: DataFrame, col: str) -> DataFrame:
-    """
-    Calcule la moyenne d'une colonne de score par appellation.
-        - Convertit les valeurs en numériques, en remplaçant les non-convertibles par NaN
-        - Calcule la moyenne par appellation
-        - Remplace les NaN résultants par 0
+    def fill_missing_scores(self) -> "Cleaning":  # forward ref: class name not bound yet
+        """
+        Replace missing scores with the mean score
+        of the wines sharing the same appellation.
+        """
+        for element in self.SCORE_COLS:
+            means = self._mean_score(element)
+            self._vins = self._vins.merge(means, on="Appellation", how="left")
+            # The merge attached a temporary "mean_<col>" column; use it to fill the gaps.
+            mean_col = f"mean_{element}"
+            self._vins[element] = self._vins[element].fillna(self._vins[mean_col])

-    """
-    tmp = df[["Appellation", col]].copy()
+            self._vins = self._vins.drop(columns=[mean_col])
+        return self

-    tmp[col] = to_numeric(tmp[col], errors="coerce")
-
-    # moyenne par appellation
-    means = tmp.groupby("Appellation", as_index=False)[col].mean()
-
-    means[col] = means[col].fillna(0)
-
-    means = means.rename(columns={col: f"mean_{col}"})
-
-
-def mean_robert(df: DataFrame) -> DataFrame:
-    return mean_score(df, "Robert")
-
-
-def mean_robinson(df: DataFrame) -> DataFrame:
-    return mean_score(df, "Robinson")
-
-
-def mean_suckling(df: DataFrame) -> DataFrame:
-    return mean_score(df, "Suckling")
-
-
-def fill_missing_scores(df: DataFrame) -> DataFrame:
-    """
-    Remplacer les notes manquantes par la moyenne
-    des vins de la même appellation.
-    """
-    df_copy = df.copy()
-    df_copy["Appellation"] = df_copy["Appellation"].astype(str).str.strip()
-
-    for score in SCORE_COLS:
-        df_copy[score] = to_numeric(df_copy[score], errors="coerce")
-
-    temp_cols: list[str] = []
-
-    for score in SCORE_COLS:
-        mean_df = mean_score(df_copy, score)
-        mean_name = f"mean_{score}"
-        temp_cols.append(mean_name)
-
-        df_copy = df_copy.merge(mean_df, on="Appellation", how="left")
-        df_copy[score] = df_copy[score].fillna(df_copy[mean_name])
-
-    df_copy = df_copy.drop(columns=temp_cols)
-    return df_copy
-
-
-def encode_appellation(df: DataFrame, column: str = "Appellation") -> DataFrame:
-    """
-    Remplace la colonne 'Appellation' par des colonnes indicatrices
-    """
-    df_copy = df.copy()
-
-    appellations = df_copy[column].astype(str).str.strip()
-
-    appellation_dummies = get_dummies(appellations)
-
-    df_copy = df_copy.drop(columns=[column])
-
-    return df_copy.join(appellation_dummies)
+    def encode_appellation(self, column: str = "Appellation") -> "Cleaning":  # forward ref
+        """
+        Replace the `column` text column with one indicator (dummy) column per distinct value.
+        """
+        appellations = self._vins[column].astype(str).str.strip()
+        appellation_dummies = get_dummies(appellations)
+        self._vins = self._vins.drop(columns=[column])
+        self._vins = self._vins.join(appellation_dummies)  # join aligns on the row index
+        return self
--- a/tests/test_cleaning.py
+++ b/tests/test_cleaning.py
@@ -1,64 +1,68 @@
-import pandas as pd
 import pytest
 from pandas import DataFrame
-
-from cleaning import (
-    SCORE_COLS,
-    drop_empty_appellation,
-    mean_score,
-    fill_missing_scores,
-    encode_appellation,
-)
+from unittest.mock import patch, mock_open
+from cleaning import Cleaning


@pytest.fixture
-def df_raw() -> DataFrame:
-    return pd.DataFrame({
-        "Appellation": ["Pauillac", "Pauillac ", "Margaux", None, "Pomerol", "Pomerol"],
-        "Robert":      ["95", None, "bad", 90, None, None],
-        "Robinson":    [None, "93", 18, None, None, None],
-        "Suckling":    [96, None, None, None, 91, None],
-        "Prix":        ["10.0", "11.0", "20.0", "30.0", "40.0", "50.0"],
-    })
+def cleaning_raw() -> Cleaning:
+    """Return a Cleaning loaded from mocked CSV content; the rows correspond to:
+    "Appellation": ["Pauillac", "Pauillac ", "Margaux", None  , "Pomerol", "Pomerol"],
+    "Robert":      ["95"      , None       , "bad"    , 90    , None     , None     ],
+    "Robinson":    [None      , "93"       , 18       , None  , None     , None     ],
+    "Suckling":    [96        , None       , None     , None  , 91       , None     ],
+    "Prix":        ["10.0"    , "11.0"     , "20.0"   , "30.0", "40.0"   , "50.0"   ],
+    """
+    csv_content = """Appellation,Robert,Robinson,Suckling,Prix
+Pauillac,95,,96,10.0
+Pauillac ,,93,,11.0
+Margaux,bad,18,,20.0
+,90,,,30.0
+Pomerol,,,91,40.0
+Pomerol,,,,50.0
+"""
+    m = mock_open(read_data=csv_content)
+    with patch("builtins.open", m):
+        return Cleaning("donnee.csv")


-def test_drop_empty_appellation(df_raw: DataFrame):
-    out = drop_empty_appellation(df_raw)
+def test_drop_empty_appellation(cleaning_raw: Cleaning) -> None:
+    out = cleaning_raw.drop_empty_appellation().getVins()
    assert out["Appellation"].isna().sum() == 0
-    assert len(out) == 5 
+    assert len(out) == 5  # the fixture has 6 rows, one of them with an empty Appellation


-def test_mean_score_zero_when_no_scores(df_raw: DataFrame):
-    out = drop_empty_appellation(df_raw)
-    m = mean_score(out, "Robert")
+def test_mean_score_zero_when_no_scores(cleaning_raw: Cleaning) -> None:
+    out = cleaning_raw.drop_empty_appellation()
+    m = out._mean_score("Robert")
    assert list(m.columns) == ["Appellation", "mean_Robert"]
-
-    # Pomerol n'a aucune note Robert => moyenne doit être 0
-    pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[0]
+    pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[
+        0
+    ]  # Pomerol has no Robert score in the fixture, so its mean must be 0
    assert pomerol_mean == 0


-def test_fill_missing_scores(df_raw: DataFrame):
-    out = drop_empty_appellation(df_raw)
-    filled = fill_missing_scores(out)
+def test_fill_missing_scores(cleaning_raw: Cleaning) -> None:
+    cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()

-    # plus de NaN dans les colonnes de scores
-    for col in SCORE_COLS:
+    cleaning_raw.drop_empty_appellation()
+    filled = cleaning_raw.fill_missing_scores().getVins()
+    for col in cleaning_raw.SCORE_COLS:
        assert filled[col].isna().sum() == 0
-
-    assert filled.loc[1, "Robert"] == 95.0
-
-    # pas de colonnes temporaires mean_*
-    for col in SCORE_COLS:
-        assert f"mean_{col}" not in filled.columns
+    # Both Pauillac rows must hold 95.0: one original value, one filled from the group mean.
+    pauillac_robert = filled[filled["Appellation"] == "Pauillac"]["Robert"]
+    assert (pauillac_robert == 95.0).all()


-def test_encode_appellation(df_raw: DataFrame):
-    out = drop_empty_appellation(df_raw)
-    filled = fill_missing_scores(out)
-    encoded = encode_appellation(filled)
+def test_encode_appellation(cleaning_raw: Cleaning) -> None:
+    cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()

-    # la colonne texte disparaît
-    assert "Appellation" not in encoded.columns
-    assert "Pauillac" in encoded.columns
-    assert encoded.loc[0, "Pauillac"] == 1
+    out = (
+        cleaning_raw.drop_empty_appellation()
+        .fill_missing_scores()
+        .encode_appellation()
+        .getVins()
+    )
+    assert "Appellation" not in out.columns  # the text column is replaced by indicators
+    assert "Pauillac" in out.columns
+    assert int(out.loc[0, "Pauillac"]) == 1  # int(): presumably the indicator may be a bool