diff --git a/src/cleaning.py b/src/cleaning.py
new file mode 100644
index 0000000..fcaabdb
--- /dev/null
+++ b/src/cleaning.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+from pandas import DataFrame, to_numeric
+import pandas as pd
+
+SCORE_COLS = ["Robert", "Robinson", "Suckling"]
+
+
+def display_info(df: DataFrame, name: str = "DataFrame") -> None:
+    """
+    Print a summary of the DataFrame:
+    - its shape
+    - the dtype of each column
+    - the number of missing values per column
+    - descriptive statistics of the numeric columns
+    """
+    print(f"\n===== {name} =====")
+
+    print(f"Shape : {df.shape[0]} lignes × {df.shape[1]} colonnes")
+
+    print("\nTypes des colonnes :")
+    print(df.dtypes)
+
+    print("\nValeurs manquantes :")
+    print(df.isna().sum())
+
+    print("\nStatistiques numériques :")
+    print(df.describe().round(2))
+
+
+def drop_empty_appellation(df: DataFrame) -> DataFrame:
+    """Return the rows of `df` whose 'Appellation' value is not missing."""
+    return df.dropna(subset=["Appellation"])
+
+
+def mean_score(df: DataFrame, col: str) -> DataFrame:
+    """
+    Compute the mean of one score column per appellation.
+    - Values are converted to numbers; non-convertible ones become NaN
+    - The mean is computed per appellation
+    - Resulting NaN means are replaced by 0
+
+    Returns a DataFrame with the columns 'Appellation' and 'mean_' + col.
+    """
+    tmp = df[["Appellation", col]].copy()
+
+    tmp[col] = to_numeric(tmp[col], errors="coerce")
+
+    # mean per appellation
+    means = tmp.groupby("Appellation", as_index=False)[col].mean()
+
+    means[col] = means[col].fillna(0)
+
+    means = means.rename(columns={col: f"mean_{col}"})
+
+    return means
+
+
+def mean_robert(df: DataFrame) -> DataFrame:
+    return mean_score(df, "Robert")
+
+
+def mean_robinson(df: DataFrame) -> DataFrame:
+    return mean_score(df, "Robinson")
+
+
+def mean_suckling(df: DataFrame) -> DataFrame:
+    return mean_score(df, "Suckling")
+
+
+def fill_missing_scores(df: DataFrame) -> DataFrame:
+    """
+    Replace missing scores with the mean score of the wines
+    of the same appellation (0 when the appellation has no score).
+
+    Returns a copy of `df` that keeps the index of `df`.
+    """
+    df_copy = df.copy()
+    df_copy["Appellation"] = df_copy["Appellation"].astype(str).str.strip()
+
+    for score in SCORE_COLS:
+        df_copy[score] = to_numeric(df_copy[score], errors="coerce")
+
+        # Look the group mean up per row instead of merging: a merge resets
+        # the index and breaks when a mean_* column already exists in df.
+        means = mean_score(df_copy, score).set_index("Appellation")
+        row_means = df_copy["Appellation"].map(means[f"mean_{score}"])
+        df_copy[score] = df_copy[score].fillna(row_means)
+
+    return df_copy
+
+
+def encode_appellation(df: DataFrame, column: str = "Appellation") -> DataFrame:
+    """
+    Replace the `column` column with one 0/1 indicator column per appellation.
+    """
+    df_copy = df.copy()
+
+    appellations = df_copy[column].astype(str).str.strip()
+
+    # dtype=int: 0/1 output on every pandas version (2.x defaults to bool)
+    appellation_dummies = pd.get_dummies(appellations, dtype=int)
+
+    df_copy = df_copy.drop(columns=[column])
+
+    return df_copy.join(appellation_dummies)
diff --git a/src/main.py b/src/main.py
new file mode 100755
index 0000000..65cbd62
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+from os import getcwd
+from os.path import normpath, join
+from sys import argv, stderr
+from pandas import read_csv, DataFrame
+
+from cleaning import (display_info,
+                      drop_empty_appellation,
+                      mean_robert,
+                      mean_robinson,
+                      mean_suckling,
+                      fill_missing_scores,
+                      encode_appellation)
+
+
+def load_csv(filename: str) -> DataFrame:
+    """Read the CSV file `filename`, resolved against the working directory."""
+    path: str = normpath(join(getcwd(), filename))
+    return read_csv(path)
+
+
+def save_csv(df: DataFrame, out_filename: str) -> None:
+    """Write `df` to `out_filename` as CSV, without the index column."""
+    df.to_csv(out_filename, index=False)
+
+
+def main() -> None:
+    """Run the cleaning pipeline on the CSV file given on the command line."""
+    if len(argv) != 2:
+        raise ValueError(f"Usage: {argv[0]} fichier.csv")
+
+    df = load_csv(argv[1])
+
+    display_info(df, "Avant le nettoyage")
+
+    df = drop_empty_appellation(df)
+    save_csv(df, "donnee_clean.csv")
+    display_info(df, "Après nettoyage d'appellations manquantes")
+
+    # mean score of the wines of each appellation
+    robert_means = mean_robert(df)
+    save_csv(robert_means, "mean_robert_by_appellation.csv")
+    display_info(robert_means, "Moyennes Robert par appellation")
+
+    robinson_means = mean_robinson(df)
+    save_csv(robinson_means, "mean_robinson_by_appellation.csv")
+    display_info(robinson_means, "Moyennes Robinson par appellation")
+
+    suckling_means = mean_suckling(df)
+    save_csv(suckling_means, "mean_suckling_by_appellation.csv")
+    display_info(suckling_means, "Moyennes Suckling par appellation")
+
+    df_missing_scores = fill_missing_scores(df)
+    save_csv(df_missing_scores, "donnee_filled.csv")
+    display_info(df_missing_scores, "Après remplissage des notes manquantes par la moyenne de l'appellation")
+
+    df_ready = encode_appellation(df_missing_scores)
+    save_csv(df_ready, "donnee_ready.csv")
+    display_info(df_ready, "Après remplacer la colonne 'Appellation' par des colonnes indicatrices")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        # Top-level boundary: report on stderr and exit non-zero so that
+        # callers (shell, CI) can detect the failure.
+        print(f"ERREUR: {e}", file=stderr)
+        raise SystemExit(1) from e
diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py
new file mode 100644
index 0000000..d376ccf
--- /dev/null
+++ b/tests/test_cleaning.py
@@ -0,0 +1,67 @@
+import pandas as pd
+import pytest
+from pandas import DataFrame
+
+from cleaning import (
+    SCORE_COLS,
+    drop_empty_appellation,
+    mean_score,
+    fill_missing_scores,
+    encode_appellation,
+)
+
+
+@pytest.fixture
+def df_raw() -> DataFrame:
+    return pd.DataFrame({
+        "Appellation": ["Pauillac", "Pauillac ", "Margaux", None, "Pomerol", "Pomerol"],
+        "Robert": ["95", None, "bad", 90, None, None],
+        "Robinson": [None, "93", 18, None, None, None],
+        "Suckling": [96, None, None, None, 91, None],
+        "Prix": ["10.0", "11.0", "20.0", "30.0", "40.0", "50.0"],
+    })
+
+
+def test_drop_empty_appellation(df_raw: DataFrame):
+    out = drop_empty_appellation(df_raw)
+    assert out["Appellation"].isna().sum() == 0
+    assert len(out) == 5
+
+
+def test_mean_score_zero_when_no_scores(df_raw: DataFrame):
+    out = drop_empty_appellation(df_raw)
+    m = mean_score(out, "Robert")
+    assert list(m.columns) == ["Appellation", "mean_Robert"]
+
+    # Pomerol has no Robert score => the mean must be 0
+    pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[0]
+    assert pomerol_mean == 0
+
+
+def test_fill_missing_scores(df_raw: DataFrame):
+    out = drop_empty_appellation(df_raw)
+    filled = fill_missing_scores(out)
+
+    # no NaN left in the score columns
+    for col in SCORE_COLS:
+        assert filled[col].isna().sum() == 0
+
+    assert filled.loc[1, "Robert"] == 95.0
+
+    # the index of the input is preserved (row 3 was dropped beforehand)
+    assert list(filled.index) == list(out.index)
+
+    # no temporary mean_* columns
+    for col in SCORE_COLS:
+        assert f"mean_{col}" not in filled.columns
+
+
+def test_encode_appellation(df_raw: DataFrame):
+    out = drop_empty_appellation(df_raw)
+    filled = fill_missing_scores(out)
+    encoded = encode_appellation(filled)
+
+    # the text column is gone
+    assert "Appellation" not in encoded.columns
+    assert "Pauillac" in encoded.columns
+    assert encoded.loc[0, "Pauillac"] == 1