diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 89ac80e..20b970f 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -36,10 +36,3 @@ jobs: - name: Test with pytest run: pytest - - - name: Deploy Doc - if: github.event_name == 'push' && github.ref == 'refs/heads/main' - run: | - git config user.name github-actions - git config user.email github-actions@github.com - mkdocs gh-deploy --force diff --git a/pyproject.toml b/pyproject.toml index 6d14d59..e638cbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,12 @@ [project] name = "projet-millesima-s6" version = "0.1.0" -dependencies = ["requests==2.32.5", "beautifulsoup4==4.14.3", "pandas==2.3.3", "tqdm==4.67.3"] +dependencies = [ + "requests==2.32.5", + "beautifulsoup4==4.14.3", + "pandas==2.3.3", + "tqdm==4.67.3", +] [project.optional-dependencies] test = ["pytest==8.4.2", "requests-mock==1.12.1", "flake8==7.3.0"] diff --git a/src/cleaning.py b/src/cleaning.py new file mode 100755 index 0000000..4bf78bb --- /dev/null +++ b/src/cleaning.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 + +from os import getcwd +from os.path import normpath, join +from typing import cast +from pandas import DataFrame, read_csv, to_numeric, get_dummies +from sys import argv + + +def path_filename(filename: str) -> str: + return normpath(join(getcwd(), filename)) + + +class Cleaning: + def __init__(self, filename) -> None: + self._vins: DataFrame = read_csv(filename) + # + self.SCORE_COLS: list[str] = [ + c for c in self._vins.columns if c not in ["Appellation", "Prix"] + ] + # + for col in self.SCORE_COLS: + self._vins[col] = to_numeric(self._vins[col], errors="coerce") + + def getVins(self) -> DataFrame: + return self._vins.copy(deep=True) + + def __str__(self) -> str: + """ + Affiche un résumé du DataFrame + - la taille + - types des colonnes + - valeurs manquantes + - statistiques numériques + """ + return ( + f"Shape : {self._vins.shape[0]} lignes x 
{self._vins.shape[1]} colonnes\n\n" + f"Types des colonnes :\n{self._vins.dtypes}\n\n" + f"Valeurs manquantes :\n{self._vins.isna().sum()}\n\n" + f"Statistiques numériques :\n{self._vins.describe().round(2)}\n\n" + ) + + def drop_empty_appellation(self) -> "Cleaning": + self._vins = self._vins.dropna(subset=["Appellation"]) + return self + + def _mean_score(self, col: str) -> DataFrame: + """ + Calcule la moyenne d'une colonne de score par appellation. + - Convertit les valeurs en numériques, en remplaçant les non-convertibles par NaN + - Calcule la moyenne par appellation + - Remplace les NaN résultants par 0 + + """ + means = self._vins.groupby("Appellation", as_index=False)[col].mean() + means = means.rename( + columns={col: f"mean_{col}"} + ) # pyright: ignore[reportCallIssue] + return cast(DataFrame, means.fillna(0)) + + def _mean_robert(self) -> DataFrame: + return self._mean_score("Robert") + + def _mean_robinson(self) -> DataFrame: + return self._mean_score("Robinson") + + def _mean_suckling(self) -> DataFrame: + return self._mean_score("Suckling") + + def fill_missing_scores(self) -> "Cleaning": + """ + Remplacer les notes manquantes par la moyenne + des vins de la même appellation. 
+ """ + for element in self.SCORE_COLS: + means = self._mean_score(element) + self._vins = self._vins.merge(means, on="Appellation", how="left") + + mean_col = f"mean_{element}" + self._vins[element] = self._vins[element].fillna(self._vins[mean_col]) + + self._vins = self._vins.drop(columns=["mean_" + element]) + return self + + def encode_appellation(self, column: str = "Appellation") -> "Cleaning": + """ + Remplace la colonne 'Appellation' par des colonnes indicatrices + """ + appellations = self._vins[column].astype(str).str.strip() + appellation_dummies = get_dummies(appellations) + self._vins = self._vins.drop(columns=[column]) + self._vins = self._vins.join(appellation_dummies) + return self + + + def main() -> None: + if len(argv) != 2: + raise ValueError(f"Usage: {argv[0]} <csv_file>") + + filename = argv[1] + cleaning: Cleaning = Cleaning(filename) + _ = cleaning.drop_empty_appellation().fill_missing_scores().encode_appellation() + + + if __name__ == "__main__": + try: + main() + except Exception as e: + print(f"ERREUR: {e}") diff --git a/src/scraper.py b/src/scraper.py index 2a11571..76e4f6a 100755 --- a/src/scraper.py +++ b/src/scraper.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from collections import OrderedDict -from io import SEEK_END, SEEK_SET, BufferedWriter +from io import SEEK_END, SEEK_SET, BufferedWriter, TextIOWrapper from json import JSONDecodeError, loads from os import makedirs from os.path import dirname, exists, join, normpath, realpath @@ -383,8 +384,7 @@ class Scraper: list[dict[str, Any]], data.get("products") ) - if isinstance(products, list): - return products + return products except (JSONDecodeError, HTTPError): return None @@ -407,6 +407,44 @@ class Scraper: except (JSONDecodeError, HTTPError) as e: print(f"Erreur sur le produit {link}: {e}") + def 
_initstate(self, reset: bool) -> tuple[int, set[str]]: + """ + appelle la fonction pour load le cache, si il existe + pas, il utilise les variables de base sinon il override + toute les variables pour continuer et pas recommencer le + processus en entier. + + Args: + reset (bool): pouvoir le reset ou pas + + Returns: + tuple[int, set[str]]: le contenu de la page et du cache + """ + if not reset: + # + serializable: tuple[int, set[str]] | None = loadstate() + if isinstance(serializable, tuple): + return serializable + return 1, set() + + def _ensuretitle(self, f: TextIOWrapper, title: str) -> None: + """ + check si le titre est bien présent au début du buffer + sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le + temps a la fin du buffer, si on a ecrit des choses avant + le titre sera apres ces données mais on part du principe + que personne va toucher le fichier. + + Args: + f (TextIOWrapper): buffer stream fichier + title (str): titre du csv + """ + _ = f.seek(0, SEEK_SET) + if not (f.read(len(title)) == title): + _ = f.write(title) + else: + _ = f.seek(0, SEEK_END) + def getvins(self, subdir: str, filename: str, reset: bool = False) -> None: """ Scrape toutes les pages d'une catégorie et sauvegarde en CSV. @@ -420,35 +458,13 @@ class Scraper: mode: Literal["w", "a+"] = "w" if reset else "a+" # titre title: str = "Appellation,Robert,Robinson,Suckling,Prix\n" - # page du début - page: int = 1 - # le set qui sert de cache - cache: set[str] = set[str]() + # page: page où commence le scraper + # cache: tout les pages déjà parcourir + page, cache = self._initstate(reset) - custom_format = "{l_bar} {bar:20} {r_bar}" - - if not reset: - # appelle la fonction pour load le cache, si il existe - # pas, il utilise les variables de base sinon il override - # toute les variables pour continuer et pas recommencer le - # processus en entier. 
- serializable: tuple[int, set[str]] | None = loadstate() - if isinstance(serializable, tuple): - # override la page et le cache - page, cache = serializable try: with open(filename, mode) as f: - # check si le titre est bien présent au début du buffer - # sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le - # temps a la fin du buffer, si on a ecrit des choses avant - # le titre sera apres ces données mais on part du principe - # que personne va toucher le fichier. - _ = f.seek(0, SEEK_SET) - if not (f.read(len(title)) == title): - _ = f.write(title) - else: - _ = f.seek(0, SEEK_END) - + self._ensuretitle(f, title) while True: products_list: list[dict[str, Any]] | None = ( self._geturlproductslist(f"{subdir}?page={page}") @@ -457,15 +473,21 @@ class Scraper: break pbar: tqdm[dict[str, Any]] = tqdm( - products_list, bar_format=custom_format + products_list, bar_format="{l_bar} {bar:20} {r_bar}" ) for product in pbar: - keyword = product.get("seoKeyword", "Inconnu")[:40] + keyword: str = cast( + str, product.get("seoKeyword", "Inconnu")[:40] + ) pbar.set_description( f"Page: {page:<3} | Product: {keyword:<40}" ) self._writevins(cache, product, f) page += 1 + # va créer un fichier au début et l'override + # tout les 5 pages au cas où SIGHUP ou autre + if page % 5 == 0 and not reset: + savestate((page, cache)) except (Exception, HTTPError, KeyboardInterrupt, JSONDecodeError): if not reset: savestate((page, cache)) diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py new file mode 100755 index 0000000..28f42c6 --- /dev/null +++ b/tests/test_cleaning.py @@ -0,0 +1,67 @@ +import pytest +from unittest.mock import patch, mock_open +from cleaning import Cleaning + + +@pytest.fixture +def cleaning_raw() -> Cleaning: + """ + "Appellation": ["Pauillac", "Pauillac ", "Margaux", None , "Pomerol", "Pomerol"], + "Robert": ["95" , None , "bad" , 90 , None , None ], + "Robinson": [None , "93" , 18 , None , None , None ], + "Suckling": [96 , None , None , None , 91 , None 
], + "Prix": ["10.0" , "11.0" , "20.0" , "30.0", "40.0" , "50.0" ], + """ + csv_content = """Appellation,Robert,Robinson,Suckling,Prix +Pauillac,95,,96,10.0 +Pauillac ,,93,,11.0 +Margaux,bad,18,,20.0 +,90,,,30.0 +Pomerol,,,91,40.0 +Pomerol,,,,50.0 +""" + m = mock_open(read_data=csv_content) + with patch("builtins.open", m): + return Cleaning("donnee.csv") + + +def test_drop_empty_appellation(cleaning_raw: Cleaning) -> None: + out = cleaning_raw.drop_empty_appellation().getVins() + assert out["Appellation"].isna().sum() == 0 + assert len(out) == 5 + + +def test_mean_score_zero_when_no_scores(cleaning_raw: Cleaning) -> None: + out = cleaning_raw.drop_empty_appellation() + m = out._mean_score("Robert") + assert list(m.columns) == ["Appellation", "mean_Robert"] + pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[ + 0 + ] + assert pomerol_mean == 0 + + +def test_fill_missing_scores(cleaning_raw: Cleaning): + cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip() + + cleaning_raw.drop_empty_appellation() + filled = cleaning_raw.fill_missing_scores().getVins() + for col in cleaning_raw.SCORE_COLS: + assert filled[col].isna().sum() == 0 + + pauillac_robert = filled[filled["Appellation"] == "Pauillac"]["Robert"] + assert (pauillac_robert == 95.0).all() + + +def test_encode_appellation(cleaning_raw: Cleaning): + cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip() + + out = ( + cleaning_raw.drop_empty_appellation() + .fill_missing_scores() + .encode_appellation() + .getVins() + ) + assert "Appellation" not in out.columns + assert "Pauillac" in out.columns + assert int(out.loc[0, "Pauillac"]) == 1 diff --git a/tests/test_scraper.py b/tests/test_scraper.py old mode 100644 new mode 100755