1 Commit

Author SHA1 Message Date
0f6eb856c6 add: restructure files and modify the scraper 2026-03-01 19:39:57 +01:00
8 changed files with 74 additions and 199 deletions

cleaning.py

@@ -1,48 +0,0 @@
#!/usr/bin/env python3
from pandas import DataFrame, to_numeric


def display_info(df: DataFrame) -> None:
    print(df.all())  # boolean reduction of each column
    print(df.info())
    print("\nNumber of missing values per column:")
    print(df.isna().sum())


def drop_empty_appellation(df: DataFrame) -> DataFrame:
    return df.dropna(subset=["Appellation"])


def mean_score(df: DataFrame, col: str) -> DataFrame:
    """
    Compute the mean of a score column per appellation.
    - Converts the values to numeric, coercing non-convertible entries to NaN
    - Computes the mean per appellation
    - Replaces the resulting NaN values with 0
    """
    tmp = df[["Appellation", col]].copy()
    tmp[col] = to_numeric(tmp[col], errors="coerce")
    # mean per appellation
    means = tmp.groupby("Appellation", as_index=False)[col].mean()
    means[col] = means[col].fillna(0)
    means = means.rename(columns={col: f"mean_{col}"})
    return means


def mean_robert(df: DataFrame) -> DataFrame:
    return mean_score(df, "Robert")


def mean_robinson(df: DataFrame) -> DataFrame:
    return mean_score(df, "Robinson")


def mean_suckling(df: DataFrame) -> DataFrame:
    return mean_score(df, "Suckling")

60
main.py

@@ -1,60 +0,0 @@
#!/usr/bin/env python3
from os import getcwd
from os.path import normpath, join
from sys import argv
from pandas import read_csv, DataFrame
from cleaning import (display_info,
                      drop_empty_appellation,
                      mean_robert,
                      mean_robinson,
                      mean_suckling)


def load_csv(filename: str) -> DataFrame:
    path: str = normpath(join(getcwd(), filename))
    return read_csv(path)


def save_csv(df: DataFrame, out_filename: str) -> None:
    df.to_csv(out_filename, index=False)


def main() -> None:
    if len(argv) != 2:
        raise ValueError(f"Usage: {argv[0]} <filename.csv>")
    df = load_csv(argv[1])
    print("=== Before cleaning ===")
    display_info(df)
    df = drop_empty_appellation(df)
    save_csv(df, "donnee_clean.csv")
    print("\n=== After dropping missing appellations ===")
    display_info(df)
    # mean wine scores for each appellation
    robert_means = mean_robert(df)
    save_csv(robert_means, "mean_robert_by_appellation.csv")
    print("\n=== Robert means per appellation ===")
    print(robert_means.head(10))
    robinson_means = mean_robinson(df)
    save_csv(robinson_means, "mean_robinson_by_appellation.csv")
    print("\n=== Robinson means per appellation ===")
    print(robinson_means.head(10))
    suckling_means = mean_suckling(df)
    save_csv(suckling_means, "mean_suckling_by_appellation.csv")
    print("\n=== Suckling means per appellation ===")
    print(suckling_means.head(10))


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"ERROR: {e}")

15
pyproject.toml Normal file

@@ -0,0 +1,15 @@
[project]
name = "projet-millesima-s6"
version = "0.1.0"
dependencies = [
    "requests==2.32.5",
    "beautifulsoup4==4.14.3",
    "pandas==2.3.3",
]

[project.optional-dependencies]
test = ["pytest==8.4.2", "requests-mock==1.12.1"]

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
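
With this layout (and a reasonably recent pip), the runtime dependencies install via pip install -e ., and the pinned test tools via the optional extra, pip install -e ".[test]", which replaces the deleted requirements.txt below.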

requirements.txt

@@ -1,6 +0,0 @@
requests==2.32.5
requests-mock==1.12.1
beautifulsoup4==4.14.3
pytest==8.4.2
requests-mock==1.12.1
pandas==2.3.3

20
src/main.py Executable file

@@ -0,0 +1,20 @@
#!/usr/bin/env python3
from os import getcwd
from os.path import normpath, join
from sys import argv
from pandas import read_csv, DataFrame


def main() -> None:
    if len(argv) != 2:
        raise ValueError(f"{argv[0]} <filename.csv>")
    path: str = normpath(join(getcwd(), argv[1]))
    db: DataFrame = read_csv(path)
    print(db.all())  # reduces each column to a single boolean


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"ERROR: {e}")

scraper.py

@@ -3,12 +3,9 @@
 from sys import argv
 from typing import cast
 from requests import HTTPError, Response, Session
-from requests.exceptions import Timeout, ConnectionError
-import time
 from bs4 import BeautifulSoup, Tag
 from collections import OrderedDict
 from json import JSONDecodeError, loads
-from pathlib import Path


 class _ScraperData:
@@ -154,7 +151,10 @@ class _ScraperData:
         parker = self.parker()
         robinson = self.robinson()
         suckling = self.suckling()
-        prix = self.prix()
+        try:
+            prix = self.prix()
+        except ValueError:
+            prix = None
         return f"{appellation},{parker},{robinson},{suckling},{prix}"
@@ -174,12 +174,18 @@ class Scraper:
# Très utile pour éviter de renvoyer toujours les mêmes handshake # Très utile pour éviter de renvoyer toujours les mêmes handshake
# TCP et d'avoir toujours une connexion constante avec le server # TCP et d'avoir toujours une connexion constante avec le server
self._session: Session = Session() self._session: Session = Session()
self._session.headers.update({ # Crée une "fausse carte d'identité" pour éviter que le site nous
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " # bloque car on serait des robots
"AppleWebKit/537.36 (KHTML, like Gecko) " self._session.headers.update(
"Chrome/122.0.0.0 Safari/537.36", {
"Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8", "User-Agent":
}) "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/122.0.0.0 Safari/537.36",
"Accept-Language":
"fr-FR,fr;q=0.9,en;q=0.8",
}
)
# Système de cache pour éviter de solliciter le serveur inutilement # Système de cache pour éviter de solliciter le serveur inutilement
self._latest_request: tuple[(str, Response)] | None = None self._latest_request: tuple[(str, Response)] | None = None
self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[ self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
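
One side effect worth flagging in the new header block: a trailing backslash inside a string literal splices the next line into the string, including its leading indentation, so this User-Agent value now contains runs of embedded spaces that the deleted implicit-concatenation version did not. A minimal illustration:

# A backslash inside a string literal keeps the next line's indentation:
ua = "Mozilla/5.0 \
     Safari/537.36"
print(repr(ua))  # note the run of embedded spaces

# Implicit concatenation of adjacent literals avoids that:
ua = ("Mozilla/5.0 "
      "Safari/537.36")
print(repr(ua))  # 'Mozilla/5.0 Safari/537.36'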
@@ -200,20 +206,10 @@ class Scraper:
             HTTPError: If the server returns an error code (4xx, 5xx).
         """
         target_url: str = self._url + subdir.lstrip("/")
-        last_exc: Exception | None = None
-        for attempt in range(1, 4):
-            try:
-                response: Response = self._session.get(url=target_url, timeout=30)
-                response.raise_for_status()
-                return response
-            except (Timeout, ConnectionError) as e:
-                last_exc = e
-                print(f"Timeout/ConnectionError ({attempt}/3) on {target_url}: {e}")
-                time.sleep(2 * attempt)  # 2s, 4s, 6s
-        # after 3 attempts, give up
-        raise last_exc if last_exc else RuntimeError("Request failed")
+        # send a GET request to the page; on error, raise
+        response: Response = self._session.get(url=target_url, timeout=30)
+        response.raise_for_status()
+        return response

     def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response:
         """
@@ -311,7 +307,7 @@ class Scraper:
         return _ScraperData(cast(dict[str, object], current_data))

-    def _geturlproductslist(self, subdir: str):
+    def _geturlproductslist(self, subdir: str) -> list[str] | None:
         """_summary_

         Args:
@@ -335,87 +331,45 @@ class Scraper:
         except (JSONDecodeError, HTTPError):
             return None

-    def _save_progress(self, page: int, i: int, last_link: str) -> None:
-        Path("progress.txt").write_text(f"{page},{i},{last_link}", encoding="utf-8")
-
-    def _load_progress(self) -> tuple[int, int, str | None]:
-        p = Path("progress.txt")
-        if not p.exists():
-            return (1, 0, None)
-        try:
-            parts = p.read_text(encoding="utf-8").strip().split(",", 2)
-            page = int(parts[0])
-            i = int(parts[1])
-            last_link = parts[2] if len(parts) == 3 and parts[2] != "" else None
-            return (page, i, last_link)
-        except Exception:
-            return (1, 0, None)
-
-    def getvins(self, subdir: str, filename: str):
+    def getvins(self, subdir: str, filename: str) -> None:
         """_summary_

         Args:
             subdir (str): _description_
             filename (str): _description_
         """
-        start_page, start_i, last_link = self._load_progress()
-        print(f"__INFO__ Resuming at page={start_page}, index={start_i}, last_link={last_link}")
-        with open(filename, "a", encoding="utf-8") as f:
+        with open(filename, "w") as f:
             cache: set[str] = set[str]()
-            if f.tell() == 0:
-                _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
-            page = start_page - 1
+            page = 0
+            _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
             while True:
                 page += 1
-                products_list = self._geturlproductslist(f"{subdir}?page={page}")
+                products_list: list[str] | None = \
+                    self._geturlproductslist(f"{subdir}?page={page}")
                 if not products_list:
                     break
                 products_list_length = len(products_list)
-                start_at = start_i if page == start_page else 0
-                for i in range(start_at, products_list_length):
-                    product = products_list[i]
+                for i, product in enumerate(products_list):
                     if not isinstance(product, dict):
                         continue
                     link = product.get("seoKeyword")
-                    if not link:
-                        continue
-                    # to avoid duplicates:
-                    if (page == start_page) and (last_link is not None) and (link == last_link):
-                        self._save_progress(page, i + 1, link)
-                        continue
-                    self._save_progress(page, i + 1, link)
-                    if link in cache:
-                        continue
-                    try:
-                        infos = self.getjsondata(link).informations()
-                        _ = f.write(infos + "\n")
-                        print(f"page: {page} | {i + 1}/{products_list_length} {link}")
-                        cache.add(link)
-                    except (JSONDecodeError, HTTPError) as e:
-                        print(f"Error on product {link}: {e}")
+                    if link and link not in cache:
+                        try:
+                            infos = self.getjsondata(link).informations()
+                            _ = f.write(infos + "\n")
+                            print(
+                                f"page: {page} | {i + 1}/{products_list_length} {link}"
+                            )
+                            cache.add(link)
+                        except (JSONDecodeError, HTTPError) as e:
+                            print(f"Error on product {link}: {e}")
                 f.flush()
-        Path("progress.txt").unlink(missing_ok=True)


 def main() -> None:
     if len(argv) != 2: