Merge pull request #10 from guezoloic/jalon2-loic

Jalon2 loic
2026-03-29 11:33:43 +00:00 · 2026-03-04 12:52:47 +01:00
6 changed files with 39 additions and 235 deletions
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -36,3 +36,10 @@ jobs:
      - name: Test with pytest
        run: pytest
      # Documentation deployment. The `if` condition limits this step to push
      # events on the main branch, so it is skipped for every other event or ref.
      # The git identity is configured before `mkdocs gh-deploy` runs.
      - name: Deploy Doc
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        run: |
          git config user.name github-actions
          git config user.email github-actions@github.com
          mkdocs gh-deploy --force
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,12 +1,7 @@
 [project]
 name = "projet-millesima-s6"
 version = "0.1.0"
-dependencies = [
+dependencies = ["requests==2.32.5", "beautifulsoup4==4.14.3", "pandas==2.3.3", "tqdm==4.67.3"]
    "requests==2.32.5",
    "beautifulsoup4==4.14.3",
    "pandas==2.3.3",
    "tqdm==4.67.3",
 ]
 [project.optional-dependencies]
 test = ["pytest==8.4.2", "requests-mock==1.12.1", "flake8==7.3.0"]
--- a/src/cleaning.py
+++ b/src/cleaning.py
@@ -1,109 +0,0 @@
 #!/usr/bin/env python3
 from os import getcwd
 from os.path import normpath, join
 from typing import cast
 from pandas import DataFrame, read_csv, to_numeric, get_dummies
 from sys import argv
 def path_filename(filename: str) -> str:
    """Return *filename* joined to the current working directory, normalized."""
    joined = join(getcwd(), filename)
    return normpath(joined)
 class Cleaning:
    """
    Chainable cleaning steps applied to a wine CSV loaded with pandas.

    Every column other than "Appellation" and "Prix" is treated as a score
    column. The mutating steps return ``self`` so they can be chained.
    """
    def __init__(self, filename) -> None:
        """
        Load the CSV and coerce every score column to a numeric dtype.

        Args:
            filename: passed unchanged to pandas ``read_csv``.
        """
        self._vins: DataFrame = read_csv(filename)
        # list of all the score columns
        self.SCORE_COLS: list[str] = [
            c for c in self._vins.columns if c not in ["Appellation", "Prix"]
        ]
        # errors="coerce": values that cannot be parsed become NaN
        for col in self.SCORE_COLS:
            self._vins[col] = to_numeric(self._vins[col], errors="coerce")
    def getVins(self) -> DataFrame:
        """Return a deep copy of the current DataFrame."""
        return self._vins.copy(deep=True)
    def __str__(self) -> str:
        """
        Summarize the DataFrame:
            - its shape
            - the column dtypes
            - the number of missing values per column
            - rounded numeric statistics
        """
        return (
            f"Shape : {self._vins.shape[0]} lignes x {self._vins.shape[1]} colonnes\n\n"
            f"Types des colonnes :\n{self._vins.dtypes}\n\n"
            f"Valeurs manquantes :\n{self._vins.isna().sum()}\n\n"
            f"Statistiques numériques :\n{self._vins.describe().round(2)}\n\n"
        )
    def drop_empty_appellation(self) -> "Cleaning":
        """Remove the rows whose "Appellation" is missing."""
        self._vins = self._vins.dropna(subset=["Appellation"])
        return self
    def _mean_score(self, col: str) -> DataFrame:
        """
        Compute the mean of one score column per appellation.
            - groups the rows by "Appellation" and averages `col`
            - names the resulting column "mean_<col>"
            - replaces the resulting NaN (appellation without any score) by 0

        The numeric conversion itself is done once in ``__init__``.
        """
        means = self._vins.groupby("Appellation", as_index=False)[col].mean()
        means = means.rename(
            columns={col: f"mean_{col}"}
        )  # pyright: ignore[reportCallIssue]
        return cast(DataFrame, means.fillna(0))
    def _mean_robert(self) -> DataFrame:
        """Per-appellation mean of the "Robert" column."""
        return self._mean_score("Robert")
    def _mean_robinson(self) -> DataFrame:
        """Per-appellation mean of the "Robinson" column."""
        return self._mean_score("Robinson")
    def _mean_suckling(self) -> DataFrame:
        """Per-appellation mean of the "Suckling" column."""
        return self._mean_score("Suckling")
    def fill_missing_scores(self) -> "Cleaning":
        """
        Replace the missing scores by the mean score of the wines
        of the same appellation.
        """
        for element in self.SCORE_COLS:
            mean_col = f"mean_{element}"
            means = self._mean_score(element)
            # the left merge keeps every wine and attaches its appellation mean
            self._vins = self._vins.merge(means, on="Appellation", how="left")
            self._vins[element] = self._vins[element].fillna(self._vins[mean_col])
            # the helper column is only needed for the fill
            self._vins = self._vins.drop(columns=[mean_col])
        return self
    def encode_appellation(self, column: str = "Appellation") -> "Cleaning":
        """
        Replace the `column` column by indicator columns prefixed with "App".

        Values are converted to text and stripped first. Rows whose value is
        missing get no indicator set, instead of being encoded as an
        "App_nan" category.
        """
        raw = self._vins[column]
        # astype(str) turns missing values into the text "nan"; mask them back
        # to missing so that get_dummies ignores them
        appellations = raw.astype(str).str.strip().where(raw.notna())
        appellation_dummies = get_dummies(appellations, prefix="App")
        self._vins = self._vins.drop(columns=[column])
        self._vins = self._vins.join(appellation_dummies)
        return self
 def main() -> None:
    """
    Run the cleaning steps on the CSV named by the single command-line argument.

    Raises:
        ValueError: when the program is not called with exactly one argument.
    """
    if len(argv) == 2:
        pipeline: Cleaning = Cleaning(argv[1])
        # each step returns the same Cleaning instance
        pipeline = pipeline.drop_empty_appellation()
        pipeline = pipeline.fill_missing_scores()
        _ = pipeline.encode_appellation()
        return
    raise ValueError(f"Usage: {argv[0]} <filename.csv>")
 if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Top-level boundary: report the failure with the same message as
        # before, but through SystemExit so that it goes to stderr and the
        # process ends with a non-zero status instead of looking successful.
        raise SystemExit(f"ERREUR: {e}") from e
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 from collections import OrderedDict
-from io import SEEK_END, SEEK_SET, BufferedWriter, TextIOWrapper
+from io import SEEK_END, SEEK_SET, BufferedWriter
 from json import JSONDecodeError, loads
 from os import makedirs
 from os.path import dirname, exists, join, normpath, realpath
@@ -215,7 +215,6 @@ class _ScraperData:
        robinson = self.robinson()
        suckling = self.suckling()
        prix = self.prix()
        prix = self.prix()
        return f"{appellation},{parker},{robinson},{suckling},{prix}"
@@ -384,7 +383,8 @@ class Scraper:
                list[dict[str, Any]], data.get("products")
            )
-            return products
+            if isinstance(products, list):
                return products
        except (JSONDecodeError, HTTPError):
            return None
@@ -407,44 +407,6 @@ class Scraper:
                except (JSONDecodeError, HTTPError) as e:
                    print(f"Erreur sur le produit {link}: {e}")
    def _initstate(self, reset: bool) -> tuple[int, set[str]]:
        """
        appelle la fonction pour load le cache, si il existe
        pas, il utilise les variables de base sinon il override
        toute les variables pour continuer et pas recommencer le
        processus en entier.
        Args:
            reset (bool): pouvoir le reset ou pas
        Returns:
            tuple[int, set[str]]: le contenu de la page et du cache
        """
        if not reset:
            #
            serializable: tuple[int, set[str]] | None = loadstate()
            if isinstance(serializable, tuple):
                return serializable
        return 1, set()
    def _ensuretitle(self, f: TextIOWrapper, title: str) -> None:
        """
        check si le titre est bien présent au début du buffer
        sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le
        temps a la fin du buffer, si on a ecrit des choses avant
        le titre sera apres ces données mais on part du principe
        que personne va toucher le fichier.
        Args:
            f (TextIOWrapper): buffer stream fichier
            title (str): titre du csv
        """
        _ = f.seek(0, SEEK_SET)
        if not (f.read(len(title)) == title):
            _ = f.write(title)
        else:
            _ = f.seek(0, SEEK_END)
    def getvins(self, subdir: str, filename: str, reset: bool = False) -> None:
        """
        Scrape  toutes les pages d'une catégorie et sauvegarde en CSV.
@@ -458,13 +420,35 @@ class Scraper:
        mode: Literal["w", "a+"] = "w" if reset else "a+"
        # titre
        title: str = "Appellation,Robert,Robinson,Suckling,Prix\n"
-        # page: page où commence le scraper
+        # page du début
-        # cache: tout les pages déjà parcourir
+        page: int = 1
-        page, cache = self._initstate(reset)
+        # le set qui sert de cache
        cache: set[str] = set[str]()
        custom_format = "{l_bar} {bar:20} {r_bar}"
        if not reset:
            # appelle la fonction pour load le cache, si il existe
            # pas, il utilise les variables de base sinon il override
            # toute les variables pour continuer et pas recommencer le
            # processus en entier.
            serializable: tuple[int, set[str]] | None = loadstate()
            if isinstance(serializable, tuple):
                # override la page et le cache
                page, cache = serializable
        try:
            with open(filename, mode) as f:
-                self._ensuretitle(f, title)
+                # check si le titre est bien présent au début du buffer
                # sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le
                # temps a la fin du buffer, si on a ecrit des choses avant
                # le titre sera apres ces données mais on part du principe
                # que personne va toucher le fichier.
                _ = f.seek(0, SEEK_SET)
                if not (f.read(len(title)) == title):
                    _ = f.write(title)
                else:
                    _ = f.seek(0, SEEK_END)
                while True:
                    products_list: list[dict[str, Any]] | None = (
                        self._geturlproductslist(f"{subdir}?page={page}")
@@ -473,21 +457,15 @@ class Scraper:
                        break
                    pbar: tqdm[dict[str, Any]] = tqdm(
-                        products_list, bar_format="{l_bar} {bar:20} {r_bar}"
+                        products_list, bar_format=custom_format
                    )
                    for product in pbar:
-                        keyword: str = cast(
+                        keyword = product.get("seoKeyword", "Inconnu")[:40]
                            str, product.get("seoKeyword", "Inconnu")[:40]
                        )
                        pbar.set_description(
                            f"Page: {page:<3} | Product: {keyword:<40}"
                        )
                        self._writevins(cache, product, f)
                    page += 1
                    # va créer un fichier au début et l'override
                    # tout les 5 pages au cas où SIGHUP ou autre
                    if page % 5 == 0 and not reset:
                        savestate((page, cache))
        except (Exception, HTTPError, KeyboardInterrupt, JSONDecodeError):
            if not reset:
                savestate((page, cache))
--- a/tests/test_cleaning.py
+++ b/tests/test_cleaning.py
@@ -1,67 +0,0 @@
 import pytest
 from unittest.mock import patch, mock_open
 from cleaning import Cleaning
@pytest.fixture
 def cleaning_raw() -> Cleaning:
    """
    Build a Cleaning instance from an in-memory CSV.

    builtins.open is patched with mock_open(read_data=csv_content) while the
    constructor runs - presumably so that loading "donnee.csv" yields
    csv_content without touching the disk.

    Content of the CSV, column by column:
    "Appellation": ["Pauillac", "Pauillac ", "Margaux", None  , "Pomerol", "Pomerol"],
    "Robert":      ["95"      , None       , "bad"    , 90    , None     , None     ],
    "Robinson":    [None      , "93"       , 18       , None  , None     , None     ],
    "Suckling":    [96        , None       , None     , None  , 91       , None     ],
    "Prix":        ["10.0"    , "11.0"     , "20.0"   , "30.0", "40.0"   , "50.0"   ],
    """
    csv_content = """Appellation,Robert,Robinson,Suckling,Prix
 Pauillac,95,,96,10.0
 Pauillac ,,93,,11.0
 Margaux,bad,18,,20.0
 ,90,,,30.0
 Pomerol,,,91,40.0
 Pomerol,,,,50.0
 """
    m = mock_open(read_data=csv_content)
    with patch("builtins.open", m):
        return Cleaning("donnee.csv")
 def test_drop_empty_appellation(cleaning_raw: Cleaning) -> None:
    """Rows without an appellation are removed; the five others remain."""
    cleaned = cleaning_raw.drop_empty_appellation()
    frame = cleaned.getVins()
    missing = frame["Appellation"].isna().sum()
    assert missing == 0
    assert len(frame) == 5
 def test_mean_score_zero_when_no_scores(cleaning_raw: Cleaning) -> None:
    """An appellation without any "Robert" score gets a mean of 0."""
    cleaner = cleaning_raw.drop_empty_appellation()
    means = cleaner._mean_score("Robert")
    assert list(means.columns) == ["Appellation", "mean_Robert"]
    is_pomerol = means["Appellation"].str.strip() == "Pomerol"
    pomerol_mean = means.loc[is_pomerol, "mean_Robert"].iloc[0]
    assert pomerol_mean == 0
 def test_fill_missing_scores(cleaning_raw: Cleaning):
    """After filling, no score is missing and Pauillac's "Robert" is 95."""
    # normalize the labels first so "Pauillac " and "Pauillac" form one group
    stripped = cleaning_raw._vins["Appellation"].str.strip()
    cleaning_raw._vins["Appellation"] = stripped
    cleaning_raw.drop_empty_appellation()
    filled = cleaning_raw.fill_missing_scores().getVins()
    for score_col in cleaning_raw.SCORE_COLS:
        assert filled[score_col].isna().sum() == 0
    is_pauillac = filled["Appellation"] == "Pauillac"
    pauillac_robert = filled.loc[is_pauillac, "Robert"]
    assert (pauillac_robert == 95.0).all()
 def test_encode_appellation(cleaning_raw: Cleaning):
    """The full pipeline yields "App_<name>" indicator columns."""
    stripped = cleaning_raw._vins["Appellation"].str.strip()
    cleaning_raw._vins["Appellation"] = stripped
    pipeline = cleaning_raw.drop_empty_appellation()
    pipeline = pipeline.fill_missing_scores()
    encoded = pipeline.encode_appellation().getVins()
    columns = encoded.columns
    assert "App_Appellation" not in columns
    assert "App_Pauillac" in columns
    # the first row of the fixture is a Pauillac
    assert int(encoded.loc[0, "App_Pauillac"]) == 1
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py