diff --git a/pyproject.toml b/pyproject.toml index 6d14d59..e638cbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,12 @@ [project] name = "projet-millesima-s6" version = "0.1.0" -dependencies = ["requests==2.32.5", "beautifulsoup4==4.14.3", "pandas==2.3.3", "tqdm==4.67.3"] +dependencies = [ + "requests==2.32.5", + "beautifulsoup4==4.14.3", + "pandas==2.3.3", + "tqdm==4.67.3", +] [project.optional-dependencies] test = ["pytest==8.4.2", "requests-mock==1.12.1", "flake8==7.3.0"] diff --git a/src/cleaning.py b/src/cleaning.py index fcaabdb..1f3f788 100644 --- a/src/cleaning.py +++ b/src/cleaning.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -from pandas import DataFrame, to_numeric -import pandas as pd +from pandas import DataFrame, to_numeric, get_dummies SCORE_COLS = ["Robert", "Robinson", "Suckling"] @@ -37,7 +36,7 @@ def mean_score(df: DataFrame, col: str) -> DataFrame: Calcule la moyenne d'une colonne de score par appellation. - Convertit les valeurs en numériques, en remplaçant les non-convertibles par NaN - Calcule la moyenne par appellation - - Remplace les NaN résultants par 0 + - Remplace les NaN résultants par 0 """ tmp = df[["Appellation", col]].copy() @@ -46,12 +45,10 @@ def mean_score(df: DataFrame, col: str) -> DataFrame: # moyenne par appellation means = tmp.groupby("Appellation", as_index=False)[col].mean() - + means[col] = means[col].fillna(0) - + means = means.rename(columns={col: f"mean_{col}"}) - - return means def mean_robert(df: DataFrame) -> DataFrame: @@ -96,10 +93,10 @@ def encode_appellation(df: DataFrame, column: str = "Appellation") -> DataFrame: Remplace la colonne 'Appellation' par des colonnes indicatrices """ df_copy = df.copy() - + appellations = df_copy[column].astype(str).str.strip() - appellation_dummies = pd.get_dummies(appellations) + appellation_dummies = get_dummies(appellations) df_copy = df_copy.drop(columns=[column]) diff --git a/src/main.py b/src/main.py index 65cbd62..512fe20 100755 --- a/src/main.py +++ b/src/main.py @@ -5,13 +5,7 @@ from os.path import normpath, join from sys import argv from pandas import read_csv, DataFrame -from cleaning import (display_info, - drop_empty_appellation, - mean_robert, - mean_robinson, - mean_suckling, - fill_missing_scores, - encode_appellation) +from cleaning import * def load_csv(filename: str) -> DataFrame: diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py old mode 100644 new mode 100755 diff --git a/tests/test_scraper.py b/tests/test_scraper.py old mode 100644 new mode 100755