#!/usr/bin/env python3
"""Clean a wine-score CSV and report the mean score per appellation.

This module is the post-patch code recovered from a diff whose line breaks
had been collapsed: the helpers the diff placed in ``cleaning.py`` and the
driver it placed in ``main.py`` are laid out here as regular Python.
"""
from os import getcwd
from os.path import join, normpath
from sys import argv

from pandas import DataFrame, read_csv, to_numeric


def display_info(df: DataFrame) -> None:
    """Print a short diagnostic summary of *df* to standard output.

    Emits, in order: the result of ``df.all()`` (per column, whether every
    value is truthy), the ``df.info()`` report, and the count of missing
    values in each column.

    Args:
        df: The table to describe.
    """
    print(df.all())
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would append a stray "None" line to the output.
    df.info()
    print("\nNombre de valeurs manquantes par colonne :")
    print(df.isna().sum())


def drop_empty_appellation(df: DataFrame) -> DataFrame:
    """Return *df* without the rows whose ``"Appellation"`` is missing.

    Args:
        df: Table containing an ``"Appellation"`` column.

    Returns:
        A new frame; *df* itself is not modified.
    """
    return df.dropna(subset=["Appellation"])


def mean_score(df: DataFrame, col: str) -> DataFrame:
    """Compute the mean of score column *col* for each appellation.

    Values of *col* that cannot be converted to a number are turned into NaN
    and therefore do not contribute to the mean. An appellation whose mean is
    NaN (no usable score at all) is reported as 0.

    Args:
        df: Table holding an ``"Appellation"`` column and the *col* column.
        col: Name of the score column to average.

    Returns:
        A new frame with the columns ``"Appellation"`` and ``f"mean_{col}"``,
        one row per appellation. *df* itself is not modified.

    Raises:
        KeyError: If ``"Appellation"`` or *col* is not a column of *df*.
    """
    # Work on a copy so the numeric coercion never touches the caller's data.
    scores = df[["Appellation", col]].copy()
    scores[col] = to_numeric(scores[col], errors="coerce")

    # Mean per appellation; as_index=False keeps "Appellation" as a column.
    means = scores.groupby("Appellation", as_index=False)[col].mean()
    means[col] = means[col].fillna(0)

    return means.rename(columns={col: f"mean_{col}"})


def mean_robert(df: DataFrame) -> DataFrame:
    """Return the per-appellation mean of the ``"Robert"`` column."""
    return mean_score(df, "Robert")


def mean_robinson(df: DataFrame) -> DataFrame:
    """Return the per-appellation mean of the ``"Robinson"`` column."""
    return mean_score(df, "Robinson")


def mean_suckling(df: DataFrame) -> DataFrame:
    """Return the per-appellation mean of the ``"Suckling"`` column."""
    return mean_score(df, "Suckling")


def load_csv(filename: str) -> DataFrame:
    """Read the CSV file *filename* into a DataFrame.

    Args:
        filename: Path to the CSV file; a relative path is resolved against
            the current working directory.

    Returns:
        The parsed table.
    """
    path: str = normpath(join(getcwd(), filename))
    return read_csv(path)


def save_csv(df: DataFrame, out_filename: str) -> None:
    """Write *df* to *out_filename* as CSV, without the index column."""
    df.to_csv(out_filename, index=False)


def main() -> None:
    """Run the clean-up pipeline on the CSV file named on the command line.

    Steps: load the file, print diagnostics, drop the rows without an
    appellation and save the result to ``donnee_clean.csv``, print
    diagnostics again, then for each of the Robert, Robinson and Suckling
    columns save the per-appellation means to their own CSV file and print
    the first ten rows.

    Raises:
        ValueError: If the script is not given exactly one argument.
    """
    if len(argv) != 2:
        # NOTE(review): the argument placeholder was lost in the reviewed
        # source (the literal was split mid-string); "<csv_file>" is a
        # stand-in — confirm the intended wording.
        raise ValueError(f"Usage: {argv[0]} <csv_file>")

    df = load_csv(argv[1])

    print("=== Avant nettoyage ===")
    display_info(df)

    df = drop_empty_appellation(df)
    save_csv(df, "donnee_clean.csv")

    print("\n=== Après nettoyage d'appellations manquantes ===")
    display_info(df)

    # Mean wine score per appellation: one (function, output file, banner)
    # entry per score column. Banners are kept exactly as they were.
    reports = (
        (mean_robert, "mean_robert_by_appellation.csv",
         "\n=== moyenne Robert par appellation ==="),
        (mean_robinson, "mean_robinson_by_appellation.csv",
         "\n===: moyennes Robinson par appellation ==="),
        (mean_suckling, "mean_suckling_by_appellation.csv",
         "\n===: moyennes Suckling par appellation ==="),
    )
    for compute_means, out_filename, banner in reports:
        means = compute_means(df)
        save_csv(means, out_filename)
        print(banner)
        print(means.head(10))


if __name__ == "__main__":
    # NOTE(review): the reviewed source is cut off right after the `try:`
    # that follows this guard, so the original error handling around main()
    # is not reproduced here — restore it from main.py when applying this.
    main()