ajout de la reprise automatique du scraping dans getvins

2026-03-28 19:13:42 +00:00 · 2026-02-25 02:48:55 +01:00
parent 99dd71989d
commit 73c6221080
1 changed files with 85 additions and 30 deletions
--- a/scraper.py
+++ b/scraper.py
@@ -3,9 +3,12 @@
 from sys import argv
 from typing import cast
 from requests import HTTPError, Response, Session
 from requests.exceptions import Timeout, ConnectionError
 import time
 from bs4 import BeautifulSoup, Tag
 from collections import OrderedDict
 from json import JSONDecodeError, loads
 from pathlib import Path
 class _ScraperData:
@@ -171,6 +174,12 @@ class Scraper:
        # Très utile pour éviter de renvoyer toujours les mêmes handshake
        # TCP et d'avoir toujours une connexion constante avec le server
        self._session: Session = Session()
        self._session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/122.0.0.0 Safari/537.36",
            "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
        })
        # Système de cache pour éviter de solliciter le serveur inutilement
        self._latest_request: tuple[(str, Response)] | None = None
        self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
@@ -191,12 +200,20 @@ class Scraper:
            HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx).
        """
        target_url: str = self._url + subdir.lstrip("/")
-        print(f"[DEBUG] GET {target_url}")
+        
-        response: Response = self._session.get(url=target_url, timeout=10)
+        last_exc: Exception | None = None
-        print(f"[DEBUG] status={response.status_code} len={len(response.text)}")
+        for attempt in range(1, 4):
-        print(f"[DEBUG] head={response.text[:120].replace('\\n',' ')}")
+            try:
-        response.raise_for_status()
+                response: Response = self._session.get(url=target_url, timeout=30)
-        return response
+                response.raise_for_status()
                return response
            except (Timeout, ConnectionError) as e:
                last_exc = e
                print(f"Timeout/ConnectionError ({attempt}/3) sur {target_url}: {e}")
                time.sleep(2 * attempt)  # 2s, 4s, 6s
        # après 3 essais, on abandonne
        raise last_exc if last_exc else RuntimeError("Request failed")
    def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response:
        """
@@ -307,22 +324,38 @@ class Scraper:
            data: dict[str, object] = self.getjsondata(subdir).getdata()
            for element in ["initialReduxState", "categ", "content"]:
-                nxt = data.get(element)
+                data: dict[str, object] = cast(dict[str, object], data.get(element))
-                print("DEBUG key", element, "->", type(nxt))
+                if not isinstance(data, dict):
                if not isinstance(nxt, dict):
                    print("DEBUG structure manquante, stop sur", element)
                    return None
                data = nxt
-            products = data.get("products")
+            products: list[str] = cast(list[str], data.get("products"))
            print("DEBUG products type:", type(products), "len:", 0 if not isinstance(products, list) else len(products))
            if isinstance(products, list):
                return products
-        except (JSONDecodeError, HTTPError) as e:
+        except (JSONDecodeError, HTTPError):
            print(f"DEBUG HTTP/JSON error sur {subdir}: {type(e).__name__} {e}")
            return None
    def _save_progress(self, page: int, i: int, last_link: str) -> None:
        Path("progress.txt").write_text(f"{page},{i},{last_link}", encoding="utf-8")
    def _load_progress(self) -> tuple[int, int, str | None]:
        p = Path("progress.txt")
        if not p.exists():
            return (1, 0, None)
        try:
            parts = p.read_text(encoding="utf-8").strip().split(",", 2)
            page = int(parts[0])
            i = int(parts[1])
            last_link = parts[2] if len(parts) == 3 and parts[2] != "" else None
            return (page, i, last_link)
        except Exception:
            return (1, 0, None)
    def getvins(self, subdir: str, filename: str):
        """_summary_
@@ -330,11 +363,17 @@ class Scraper:
            subdir (str): _description_
            filename (str): _description_
        """
-        with open(filename, "a") as f:
+        start_page, start_i, last_link = self._load_progress()
        print(f"__INFO__ Reprise à page={start_page}, index={start_i}, last_link={last_link}")
        with open(filename, "a", encoding="utf-8") as f:
            cache: set[str] = set[str]()
-            page = 0
+            
-            _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
+            if f.tell() == 0:
-
+                _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
            page = start_page - 1
            while True:
                page += 1
                products_list = self._geturlproductslist(f"{subdir}?page={page}")
@@ -343,24 +382,40 @@ class Scraper:
                    break
                products_list_length = len(products_list)
-                for i, product in enumerate(products_list):
+                start_at = start_i if page == start_page else 0
                for i in range(start_at, products_list_length):
                    product = products_list[i]
                    if not isinstance(product, dict):
                        continue
                    link = product.get("seoKeyword")
                    if not link:
                        continue
                    # pour eviter les doublons :
                    if (page == start_page) and (last_link is not None) and (link == last_link):
                        self._save_progress(page, + 1, link)
                        continue
                    self._save_progress(page, i + 1, link)
                    if link in cache:
                        continue
                    try:
                        infos = self.getjsondata(link).informations()
                        _ = f.write(infos + "\n")
                        print(f"page: {page} | {i + 1}/{products_list_length} {link}")
                        cache.add(link)
                    except (JSONDecodeError, HTTPError) as e:
                        print(f"Erreur sur le produit {link}: {e}")
                    if link and link not in cache:
                        try:
                            infos = self.getjsondata(link).informations()
                            _ = f.write(infos + "\n")
                            print(
                                f"page: {page} | {i + 1}/{products_list_length} {link}"
                            )
                            cache.add(link)
                        except (JSONDecodeError, HTTPError) as e:
                            print(f"Erreur sur le produit {link}: {e}")
                f.flush()
        Path("progress.txt").unlink(missing_ok=True)
 def main() -> None:
    if len(argv) != 2: