From acf4ddd881f9b0b027ad3a951a20102d85be32c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20GUEZO?= Date: Fri, 6 Mar 2026 17:56:07 +0100 Subject: [PATCH] ajout: restructuration de la cleaning --- src/cleaning.py | 160 +++++++++++++++++++---------------------- tests/test_cleaning.py | 94 ++++++++++++------------ 2 files changed, 121 insertions(+), 133 deletions(-) mode change 100644 => 100755 src/cleaning.py diff --git a/src/cleaning.py b/src/cleaning.py old mode 100644 new mode 100755 index 1f3f788..781896b --- a/src/cleaning.py +++ b/src/cleaning.py @@ -1,103 +1,87 @@ #!/usr/bin/env python3 -from pandas import DataFrame, to_numeric, get_dummies -SCORE_COLS = ["Robert", "Robinson", "Suckling"] +from typing import cast, override +from pandas import DataFrame, read_csv, to_numeric, get_dummies -def display_info(df: DataFrame, name: str = "DataFrame") -> None: - """ - Affiche un résumé du DataFrame - -la taille - -types des colonnes - -valeurs manquantes - -statistiques numériques - """ - print(f"\n===== {name} =====") +class Cleaning: + def __init__(self, filename) -> None: + self._vins: DataFrame = read_csv(filename) + # + self.SCORE_COLS: list[str] = [ + c for c in self._vins.columns if c not in ["Appellation", "Prix"] + ] + # + for col in self.SCORE_COLS: + self._vins[col] = to_numeric(self._vins[col], errors="coerce") - print(f"Shape : {df.shape[0]} lignes × {df.shape[1]} colonnes") + def getVins(self) -> DataFrame: + return self._vins.copy(deep=True) - print("\nTypes des colonnes :") - print(df.dtypes) + @override + def __str__(self) -> str: + """ + Affiche un résumé du DataFrame + - la taille + - types des colonnes + - valeurs manquantes + - statistiques numériques + """ + return ( + f"Shape : {self._vins.shape[0]} lignes x {self._vins.shape[1]} colonnes\n\n" + f"Types des colonnes :\n{self._vins.dtypes}\n\n" + f"Valeurs manquantes :\n{self._vins.isna().sum()}\n\n" + f"Statistiques numériques :\n{self._vins.describe().round(2)}\n\n" + ) - 
print("\nValeurs manquantes :") - print(df.isna().sum()) + def drop_empty_appellation(self) -> Cleaning: + self._vins = self._vins.dropna(subset=["Appellation"]) + return self - print("\nStatistiques numériques :") - print(df.describe().round(2)) + def _mean_score(self, col: str) -> DataFrame: + """ + Calcule la moyenne d'une colonne de score par appellation. + - Convertit les valeurs en numériques, en remplaçant les non-convertibles par NaN + - Calcule la moyenne par appellation + - Remplace les NaN résultants par 0 + """ + means = self._vins.groupby("Appellation", as_index=False)[col].mean() + means = means.rename( + columns={col: f"mean_{col}"} + ) # pyright: ignore[reportCallIssue] + return cast(DataFrame, means.fillna(0)) -def drop_empty_appellation(df: DataFrame) -> DataFrame: + def _mean_robert(self) -> DataFrame: + return self._mean_score("Robert") - return df.dropna(subset=["Appellation"]) + def _mean_robinson(self) -> DataFrame: + return self._mean_score("Robinson") + def _mean_suckling(self) -> DataFrame: + return self._mean_score("Suckling") -def mean_score(df: DataFrame, col: str) -> DataFrame: - """ - Calcule la moyenne d'une colonne de score par appellation. - - Convertit les valeurs en numériques, en remplaçant les non-convertibles par NaN - - Calcule la moyenne par appellation - - Remplace les NaN résultants par 0 + def fill_missing_scores(self) -> Cleaning: + """ + Remplacer les notes manquantes par la moyenne + des vins de la même appellation. 
+ """ + for element in self.SCORE_COLS: + means = self._mean_score(element) + self._vins = self._vins.merge(means, on="Appellation", how="left") + + mean_col = f"mean_{element}" + self._vins[element] = self._vins[element].fillna(self._vins[mean_col]) - """ - tmp = df[["Appellation", col]].copy() + self._vins = self._vins.drop(columns=["mean_" + element]) + return self - tmp[col] = to_numeric(tmp[col], errors="coerce") - - # moyenne par appellation - means = tmp.groupby("Appellation", as_index=False)[col].mean() - - means[col] = means[col].fillna(0) - - means = means.rename(columns={col: f"mean_{col}"}) - - -def mean_robert(df: DataFrame) -> DataFrame: - return mean_score(df, "Robert") - - -def mean_robinson(df: DataFrame) -> DataFrame: - return mean_score(df, "Robinson") - - -def mean_suckling(df: DataFrame) -> DataFrame: - return mean_score(df, "Suckling") - - -def fill_missing_scores(df: DataFrame) -> DataFrame: - """ - Remplacer les notes manquantes par la moyenne - des vins de la même appellation. 
- """ - df_copy = df.copy() - df_copy["Appellation"] = df_copy["Appellation"].astype(str).str.strip() - - for score in SCORE_COLS: - df_copy[score] = to_numeric(df_copy[score], errors="coerce") - - temp_cols: list[str] = [] - - for score in SCORE_COLS: - mean_df = mean_score(df_copy, score) - mean_name = f"mean_{score}" - temp_cols.append(mean_name) - - df_copy = df_copy.merge(mean_df, on="Appellation", how="left") - df_copy[score] = df_copy[score].fillna(df_copy[mean_name]) - - df_copy = df_copy.drop(columns=temp_cols) - return df_copy - - -def encode_appellation(df: DataFrame, column: str = "Appellation") -> DataFrame: - """ - Remplace la colonne 'Appellation' par des colonnes indicatrices - """ - df_copy = df.copy() - - appellations = df_copy[column].astype(str).str.strip() - - appellation_dummies = get_dummies(appellations) - - df_copy = df_copy.drop(columns=[column]) - - return df_copy.join(appellation_dummies) + def encode_appellation(self, column: str = "Appellation") -> Cleaning: + """ + Remplace la colonne 'Appellation' par des colonnes indicatrices + """ + appellations = self._vins[column].astype(str).str.strip() + appellation_dummies = get_dummies(appellations) + self._vins = self._vins.drop(columns=[column]) + self._vins = self._vins.join(appellation_dummies) + return self \ No newline at end of file diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py index d376ccf..166c52c 100755 --- a/tests/test_cleaning.py +++ b/tests/test_cleaning.py @@ -1,64 +1,68 @@ -import pandas as pd import pytest from pandas import DataFrame - -from cleaning import ( - SCORE_COLS, - drop_empty_appellation, - mean_score, - fill_missing_scores, - encode_appellation, -) +from unittest.mock import patch, mock_open +from cleaning import Cleaning @pytest.fixture -def df_raw() -> DataFrame: - return pd.DataFrame({ - "Appellation": ["Pauillac", "Pauillac ", "Margaux", None, "Pomerol", "Pomerol"], - "Robert": ["95", None, "bad", 90, None, None], - "Robinson": [None, "93", 18, 
# tests/test_cleaning.py
from io import StringIO

import pytest
from pandas import DataFrame

from cleaning import Cleaning

# Raw fixture data. Note the trailing space in the second "Pauillac ", the
# unparseable "bad" score, and the row without any appellation.
CSV_CONTENT = """Appellation,Robert,Robinson,Suckling,Prix
Pauillac,95,,96,10.0
Pauillac ,,93,,11.0
Margaux,bad,18,,20.0
,90,,,30.0
Pomerol,,,91,40.0
Pomerol,,,,50.0
"""


@pytest.fixture
def cleaning_raw() -> Cleaning:
    """Build a ``Cleaning`` instance from the in-memory CSV above.

    The CSV is handed over as a file-like object, so no file is read and
    ``builtins.open`` does not need to be patched.
    """
    return Cleaning(StringIO(CSV_CONTENT))


def test_drop_empty_appellation(cleaning_raw: Cleaning) -> None:
    """Rows without an appellation are removed."""
    out: DataFrame = cleaning_raw.drop_empty_appellation().getVins()
    assert out["Appellation"].isna().sum() == 0
    assert len(out) == 5


def test_mean_score_zero_when_no_scores(cleaning_raw: Cleaning) -> None:
    """An appellation without any score gets a mean of 0."""
    out = cleaning_raw.drop_empty_appellation()
    m = out._mean_score("Robert")
    assert list(m.columns) == ["Appellation", "mean_Robert"]
    # Pomerol has no Robert score at all, so its mean must fall back to 0.
    pomerol_mean = m.loc[
        m["Appellation"].str.strip() == "Pomerol", "mean_Robert"
    ].iloc[0]
    assert pomerol_mean == 0


def test_fill_missing_scores(cleaning_raw: Cleaning) -> None:
    """Missing scores are filled and no temporary column is left behind."""
    # Normalise the labels so both Pauillac rows fall in the same group.
    cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()

    cleaning_raw.drop_empty_appellation()
    filled = cleaning_raw.fill_missing_scores().getVins()
    for col in cleaning_raw.SCORE_COLS:
        assert filled[col].isna().sum() == 0
        # The helper mean_* columns must not survive the fill.
        assert f"mean_{col}" not in filled.columns

    pauillac_robert = filled[filled["Appellation"] == "Pauillac"]["Robert"]
    assert (pauillac_robert == 95.0).all()


def test_encode_appellation(cleaning_raw: Cleaning) -> None:
    """The label column is replaced by indicator columns."""
    cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()

    out = (
        cleaning_raw.drop_empty_appellation()
        .fill_missing_scores()
        .encode_appellation()
        .getVins()
    )
    assert "Appellation" not in out.columns
    assert "Pauillac" in out.columns
    assert int(out.loc[0, "Pauillac"]) == 1