4 Commits

5 changed files with 190 additions and 230 deletions

View File

@@ -36,10 +36,3 @@ jobs:
- name: Test with pytest - name: Test with pytest
run: pytest run: pytest
- name: Deploy Doc
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
run: |
git config user.name github-actions
git config user.email github-actions@github.com
mkdocs gh-deploy --force

144
src/cleaning.py Normal file → Executable file
View File

@@ -1,37 +1,50 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from pandas import DataFrame, to_numeric, get_dummies
SCORE_COLS = ["Robert", "Robinson", "Suckling"] from os import getcwd
from os.path import normpath, join
from typing import cast
from pandas import DataFrame, read_csv, to_numeric, get_dummies
from sys import argv
def display_info(df: DataFrame, name: str = "DataFrame") -> None: def path_filename(filename: str) -> str:
return normpath(join(getcwd(), filename))
class Cleaning:
def __init__(self, filename) -> None:
self._vins: DataFrame = read_csv(filename)
# créer la liste de tout les scores
self.SCORE_COLS: list[str] = [
c for c in self._vins.columns if c not in ["Appellation", "Prix"]
]
# transforme tout les colonnes score en numérique
for col in self.SCORE_COLS:
self._vins[col] = to_numeric(self._vins[col], errors="coerce")
def getVins(self) -> DataFrame:
return self._vins.copy(deep=True)
def __str__(self) -> str:
""" """
Affiche un résumé du DataFrame Affiche un résumé du DataFrame
-la taille - la taille
-types des colonnes - types des colonnes
-valeurs manquantes - valeurs manquantes
-statistiques numériques - statistiques numériques
""" """
print(f"\n===== {name} =====") return (
f"Shape : {self._vins.shape[0]} lignes x {self._vins.shape[1]} colonnes\n\n"
f"Types des colonnes :\n{self._vins.dtypes}\n\n"
f"Valeurs manquantes :\n{self._vins.isna().sum()}\n\n"
f"Statistiques numériques :\n{self._vins.describe().round(2)}\n\n"
)
print(f"Shape : {df.shape[0]} lignes × {df.shape[1]} colonnes") def drop_empty_appellation(self) -> "Cleaning":
self._vins = self._vins.dropna(subset=["Appellation"])
return self
print("\nTypes des colonnes :") def _mean_score(self, col: str) -> DataFrame:
print(df.dtypes)
print("\nValeurs manquantes :")
print(df.isna().sum())
print("\nStatistiques numériques :")
print(df.describe().round(2))
def drop_empty_appellation(df: DataFrame) -> DataFrame:
return df.dropna(subset=["Appellation"])
def mean_score(df: DataFrame, col: str) -> DataFrame:
""" """
Calcule la moyenne d'une colonne de score par appellation. Calcule la moyenne d'une colonne de score par appellation.
- Convertit les valeurs en numériques, en remplaçant les non-convertibles par NaN - Convertit les valeurs en numériques, en remplaçant les non-convertibles par NaN
@@ -39,65 +52,58 @@ def mean_score(df: DataFrame, col: str) -> DataFrame:
- Remplace les NaN résultants par 0 - Remplace les NaN résultants par 0
""" """
tmp = df[["Appellation", col]].copy() means = self._vins.groupby("Appellation", as_index=False)[col].mean()
means = means.rename(
columns={col: f"mean_{col}"}
) # pyright: ignore[reportCallIssue]
return cast(DataFrame, means.fillna(0))
tmp[col] = to_numeric(tmp[col], errors="coerce") def _mean_robert(self) -> DataFrame:
return self._mean_score("Robert")
# moyenne par appellation def _mean_robinson(self) -> DataFrame:
means = tmp.groupby("Appellation", as_index=False)[col].mean() return self._mean_score("Robinson")
means[col] = means[col].fillna(0) def _mean_suckling(self) -> DataFrame:
return self._mean_score("Suckling")
means = means.rename(columns={col: f"mean_{col}"}) def fill_missing_scores(self) -> "Cleaning":
def mean_robert(df: DataFrame) -> DataFrame:
return mean_score(df, "Robert")
def mean_robinson(df: DataFrame) -> DataFrame:
return mean_score(df, "Robinson")
def mean_suckling(df: DataFrame) -> DataFrame:
return mean_score(df, "Suckling")
def fill_missing_scores(df: DataFrame) -> DataFrame:
""" """
Remplacer les notes manquantes par la moyenne Remplacer les notes manquantes par la moyenne
des vins de la même appellation. des vins de la même appellation.
""" """
df_copy = df.copy() for element in self.SCORE_COLS:
df_copy["Appellation"] = df_copy["Appellation"].astype(str).str.strip() means = self._mean_score(element)
self._vins = self._vins.merge(means, on="Appellation", how="left")
for score in SCORE_COLS: mean_col = f"mean_{element}"
df_copy[score] = to_numeric(df_copy[score], errors="coerce") self._vins[element] = self._vins[element].fillna(self._vins[mean_col])
temp_cols: list[str] = [] self._vins = self._vins.drop(columns=["mean_" + element])
return self
for score in SCORE_COLS: def encode_appellation(self, column: str = "Appellation") -> "Cleaning":
mean_df = mean_score(df_copy, score)
mean_name = f"mean_{score}"
temp_cols.append(mean_name)
df_copy = df_copy.merge(mean_df, on="Appellation", how="left")
df_copy[score] = df_copy[score].fillna(df_copy[mean_name])
df_copy = df_copy.drop(columns=temp_cols)
return df_copy
def encode_appellation(df: DataFrame, column: str = "Appellation") -> DataFrame:
""" """
Remplace la colonne 'Appellation' par des colonnes indicatrices Remplace la colonne 'Appellation' par des colonnes indicatrices
""" """
df_copy = df.copy() appellations = self._vins[column].astype(str).str.strip()
appellation_dummies = get_dummies(appellations, prefix="App")
self._vins = self._vins.drop(columns=[column])
self._vins = self._vins.join(appellation_dummies)
return self
appellations = df_copy[column].astype(str).str.strip()
appellation_dummies = get_dummies(appellations) def main() -> None:
if len(argv) != 2:
raise ValueError(f"Usage: {argv[0]} <filename.csv>")
df_copy = df_copy.drop(columns=[column]) filename = argv[1]
cleaning: Cleaning = Cleaning(filename)
_ = cleaning.drop_empty_appellation().fill_missing_scores().encode_appellation()
return df_copy.join(appellation_dummies)
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Top-level boundary: report the failure on stderr and leave with a
        # non-zero exit status so that a shell or CI job can detect it.
        # SystemExit with a string argument does both, without a traceback.
        raise SystemExit(f"ERREUR: {e}") from e

View File

@@ -1,58 +0,0 @@
#!/usr/bin/env python3
from os import getcwd
from os.path import normpath, join
from sys import argv
from pandas import read_csv, DataFrame
from cleaning import *
def load_csv(filename: str) -> DataFrame:
    """
    Read a CSV file into a DataFrame.

    Args:
        filename: path of the CSV file; it is joined onto the current
            working directory and normalised before being read

    Returns:
        The DataFrame built by ``pandas.read_csv``.
    """
    located: str = join(getcwd(), filename)
    return read_csv(normpath(located))
def save_csv(df: DataFrame, out_filename: str) -> None:
    """Write *df* to *out_filename* as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=out_filename, index=False)
def main() -> None:
    """
    Run the cleaning pipeline on the CSV file named on the command line.

    After each stage the intermediate table is saved to a CSV file and
    then passed to ``display_info``.

    Raises:
        ValueError: when the script is not given exactly one argument
    """
    if len(argv) != 2:
        raise ValueError(f"Usage: {argv[0]} <filename.csv>")

    raw = load_csv(argv[1])
    display_info(raw, "Avant le nettoyage")

    cleaned = drop_empty_appellation(raw)
    save_csv(cleaned, "donnee_clean.csv")
    display_info(cleaned, "Après nettoyage d'appellations manquantes")

    # mean score of the wines for each appellation, one critic at a time
    per_critic = (
        (
            mean_robert,
            "mean_robert_by_appellation.csv",
            "Moyennes Robert par appellation",
        ),
        (
            mean_robinson,
            "mean_robinson_by_appellation.csv",
            "Moyennes Robinson par appellation",
        ),
        (
            mean_suckling,
            "mean_suckling_by_appellation.csv",
            "Moyennes Suckling par appellation",
        ),
    )
    for compute_means, out_name, label in per_critic:
        means = compute_means(cleaned)
        save_csv(means, out_name)
        display_info(means, label)

    filled = fill_missing_scores(cleaned)
    save_csv(filled, "donnee_filled.csv")
    display_info(
        filled,
        "Après remplissage des notes manquantes par la moyenne de l'appellation",
    )

    ready = encode_appellation(filled)
    save_csv(ready, "donnee_ready.csv")
    display_info(
        ready,
        "Après remplacer la colonne 'Appellation' par des colonnes indicatrices",
    )
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Top-level boundary: report the failure on stderr and leave with a
        # non-zero exit status so that a shell or CI job can detect it.
        # SystemExit with a string argument does both, without a traceback.
        raise SystemExit(f"ERREUR: {e}") from e

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from collections import OrderedDict from collections import OrderedDict
from io import SEEK_END, SEEK_SET, BufferedWriter from io import SEEK_END, SEEK_SET, BufferedWriter, TextIOWrapper
from json import JSONDecodeError, loads from json import JSONDecodeError, loads
from os import makedirs from os import makedirs
from os.path import dirname, exists, join, normpath, realpath from os.path import dirname, exists, join, normpath, realpath
@@ -407,6 +407,44 @@ class Scraper:
except (JSONDecodeError, HTTPError) as e: except (JSONDecodeError, HTTPError) as e:
print(f"Erreur sur le produit {link}: {e}") print(f"Erreur sur le produit {link}: {e}")
def _initstate(self, reset: bool) -> tuple[int, set[str]]:
"""
appelle la fonction pour load le cache, si il existe
pas, il utilise les variables de base sinon il override
toute les variables pour continuer et pas recommencer le
processus en entier.
Args:
reset (bool): pouvoir le reset ou pas
Returns:
tuple[int, set[str]]: le contenu de la page et du cache
"""
if not reset:
#
serializable: tuple[int, set[str]] | None = loadstate()
if isinstance(serializable, tuple):
return serializable
return 1, set()
def _ensuretitle(self, f: TextIOWrapper, title: str) -> None:
"""
check si le titre est bien présent au début du buffer
sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le
temps a la fin du buffer, si on a ecrit des choses avant
le titre sera apres ces données mais on part du principe
que personne va toucher le fichier.
Args:
f (TextIOWrapper): buffer stream fichier
title (str): titre du csv
"""
_ = f.seek(0, SEEK_SET)
if not (f.read(len(title)) == title):
_ = f.write(title)
else:
_ = f.seek(0, SEEK_END)
def getvins(self, subdir: str, filename: str, reset: bool = False) -> None: def getvins(self, subdir: str, filename: str, reset: bool = False) -> None:
""" """
Scrape toutes les pages d'une catégorie et sauvegarde en CSV. Scrape toutes les pages d'une catégorie et sauvegarde en CSV.
@@ -420,35 +458,13 @@ class Scraper:
mode: Literal["w", "a+"] = "w" if reset else "a+" mode: Literal["w", "a+"] = "w" if reset else "a+"
# titre # titre
title: str = "Appellation,Robert,Robinson,Suckling,Prix\n" title: str = "Appellation,Robert,Robinson,Suckling,Prix\n"
# page du début # page: page où commence le scraper
page: int = 1 # cache: tout les pages déjà parcourir
# le set qui sert de cache page, cache = self._initstate(reset)
cache: set[str] = set[str]()
custom_format = "{l_bar} {bar:20} {r_bar}"
if not reset:
# appelle la fonction pour load le cache, si il existe
# pas, il utilise les variables de base sinon il override
# toute les variables pour continuer et pas recommencer le
# processus en entier.
serializable: tuple[int, set[str]] | None = loadstate()
if isinstance(serializable, tuple):
# override la page et le cache
page, cache = serializable
try: try:
with open(filename, mode) as f: with open(filename, mode) as f:
# check si le titre est bien présent au début du buffer self._ensuretitle(f, title)
# sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le
# temps a la fin du buffer, si on a ecrit des choses avant
# le titre sera apres ces données mais on part du principe
# que personne va toucher le fichier.
_ = f.seek(0, SEEK_SET)
if not (f.read(len(title)) == title):
_ = f.write(title)
else:
_ = f.seek(0, SEEK_END)
while True: while True:
products_list: list[dict[str, Any]] | None = ( products_list: list[dict[str, Any]] | None = (
self._geturlproductslist(f"{subdir}?page={page}") self._geturlproductslist(f"{subdir}?page={page}")
@@ -457,7 +473,7 @@ class Scraper:
break break
pbar: tqdm[dict[str, Any]] = tqdm( pbar: tqdm[dict[str, Any]] = tqdm(
products_list, bar_format=custom_format products_list, bar_format="{l_bar} {bar:20} {r_bar}"
) )
for product in pbar: for product in pbar:
keyword: str = cast( keyword: str = cast(

View File

@@ -1,64 +1,67 @@
import pandas as pd
import pytest import pytest
from pandas import DataFrame from unittest.mock import patch, mock_open
from cleaning import Cleaning
from cleaning import (
SCORE_COLS,
drop_empty_appellation,
mean_score,
fill_missing_scores,
encode_appellation,
)
@pytest.fixture @pytest.fixture
def df_raw() -> DataFrame: def cleaning_raw() -> Cleaning:
return pd.DataFrame({ """
"Appellation": ["Pauillac", "Pauillac ", "Margaux", None, "Pomerol", "Pomerol"], "Appellation": ["Pauillac", "Pauillac ", "Margaux", None , "Pomerol", "Pomerol"],
"Robert": ["95", None, "bad", 90, None, None], "Robert": ["95" , None , "bad" , 90 , None , None ],
"Robinson": [None, "93", 18, None, None, None], "Robinson": [None , "93" , 18 , None , None , None ],
"Suckling": [96, None, None, None, 91, None], "Suckling": [96 , None , None , None , 91 , None ],
"Prix": ["10.0", "11.0", "20.0", "30.0", "40.0", "50.0"], "Prix": ["10.0" , "11.0" , "20.0" , "30.0", "40.0" , "50.0" ],
}) """
csv_content = """Appellation,Robert,Robinson,Suckling,Prix
Pauillac,95,,96,10.0
Pauillac ,,93,,11.0
Margaux,bad,18,,20.0
,90,,,30.0
Pomerol,,,91,40.0
Pomerol,,,,50.0
"""
m = mock_open(read_data=csv_content)
with patch("builtins.open", m):
return Cleaning("donnee.csv")
def test_drop_empty_appellation(df_raw: DataFrame): def test_drop_empty_appellation(cleaning_raw: Cleaning) -> None:
out = drop_empty_appellation(df_raw) out = cleaning_raw.drop_empty_appellation().getVins()
assert out["Appellation"].isna().sum() == 0 assert out["Appellation"].isna().sum() == 0
assert len(out) == 5 assert len(out) == 5
def test_mean_score_zero_when_no_scores(df_raw: DataFrame): def test_mean_score_zero_when_no_scores(cleaning_raw: Cleaning) -> None:
out = drop_empty_appellation(df_raw) out = cleaning_raw.drop_empty_appellation()
m = mean_score(out, "Robert") m = out._mean_score("Robert")
assert list(m.columns) == ["Appellation", "mean_Robert"] assert list(m.columns) == ["Appellation", "mean_Robert"]
pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[
# Pomerol n'a aucune note Robert => moyenne doit être 0 0
pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[0] ]
assert pomerol_mean == 0 assert pomerol_mean == 0
def test_fill_missing_scores(df_raw: DataFrame): def test_fill_missing_scores(cleaning_raw: Cleaning):
out = drop_empty_appellation(df_raw) cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()
filled = fill_missing_scores(out)
# plus de NaN dans les colonnes de scores cleaning_raw.drop_empty_appellation()
for col in SCORE_COLS: filled = cleaning_raw.fill_missing_scores().getVins()
for col in cleaning_raw.SCORE_COLS:
assert filled[col].isna().sum() == 0 assert filled[col].isna().sum() == 0
assert filled.loc[1, "Robert"] == 95.0 pauillac_robert = filled[filled["Appellation"] == "Pauillac"]["Robert"]
assert (pauillac_robert == 95.0).all()
# pas de colonnes temporaires mean_*
for col in SCORE_COLS:
assert f"mean_{col}" not in filled.columns
def test_encode_appellation(df_raw: DataFrame): def test_encode_appellation(cleaning_raw: Cleaning):
out = drop_empty_appellation(df_raw) cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()
filled = fill_missing_scores(out)
encoded = encode_appellation(filled)
# la colonne texte disparaît out = (
assert "Appellation" not in encoded.columns cleaning_raw.drop_empty_appellation()
assert "Pauillac" in encoded.columns .fill_missing_scores()
assert encoded.loc[0, "Pauillac"] == 1 .encode_appellation()
.getVins()
)
assert "App_Appellation" not in out.columns
assert "App_Pauillac" in out.columns
assert int(out.loc[0, "App_Pauillac"]) == 1