add automatic resume of scraping in getvins

Chahrazad650 committed 2026-02-25 02:48:55 +01:00
parent 99dd71989d
commit 73c6221080


@@ -3,9 +3,12 @@
 from sys import argv
 from typing import cast
 from requests import HTTPError, Response, Session
+from requests.exceptions import Timeout, ConnectionError
+import time
 from bs4 import BeautifulSoup, Tag
 from collections import OrderedDict
 from json import JSONDecodeError, loads
+from pathlib import Path


 class _ScraperData:
@@ -171,6 +174,12 @@ class Scraper:
         # Very useful to avoid redoing the same TCP handshakes every time
         # and to keep a persistent connection with the server
         self._session: Session = Session()
+        self._session.headers.update({
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/122.0.0.0 Safari/537.36",
+            "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
+        })
         # Cache system to avoid hitting the server unnecessarily
         self._latest_request: tuple[(str, Response)] | None = None
         self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
@@ -191,12 +200,20 @@ class Scraper:
             HTTPError: If the server returns an error code (4xx, 5xx).
         """
         target_url: str = self._url + subdir.lstrip("/")
-        print(f"[DEBUG] GET {target_url}")
-        response: Response = self._session.get(url=target_url, timeout=10)
-        print(f"[DEBUG] status={response.status_code} len={len(response.text)}")
-        print(f"[DEBUG] head={response.text[:120].replace('\\n',' ')}")
-        response.raise_for_status()
-        return response
+        last_exc: Exception | None = None
+        for attempt in range(1, 4):
+            try:
+                response: Response = self._session.get(url=target_url, timeout=30)
+                response.raise_for_status()
+                return response
+            except (Timeout, ConnectionError) as e:
+                last_exc = e
+                print(f"Timeout/ConnectionError ({attempt}/3) on {target_url}: {e}")
+                time.sleep(2 * attempt)  # 2s, 4s, 6s
+        # after 3 attempts, give up
+        raise last_exc if last_exc else RuntimeError("Request failed")

     def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response:
         """
@@ -307,22 +324,38 @@ class Scraper:
             data: dict[str, object] = self.getjsondata(subdir).getdata()
             for element in ["initialReduxState", "categ", "content"]:
-                nxt = data.get(element)
-                print("DEBUG key", element, "->", type(nxt))
-                if not isinstance(nxt, dict):
-                    print("DEBUG missing structure, stopping at", element)
+                data: dict[str, object] = cast(dict[str, object], data.get(element))
+                if not isinstance(data, dict):
                     return None
-                data = nxt
-            products = data.get("products")
-            print("DEBUG products type:", type(products), "len:", 0 if not isinstance(products, list) else len(products))
+            products: list[str] = cast(list[str], data.get("products"))
             if isinstance(products, list):
                 return products
-        except (JSONDecodeError, HTTPError) as e:
-            print(f"DEBUG HTTP/JSON error on {subdir}: {type(e).__name__} {e}")
+        except (JSONDecodeError, HTTPError):
             return None
+
+    def _save_progress(self, page: int, i: int, last_link: str) -> None:
+        Path("progress.txt").write_text(f"{page},{i},{last_link}", encoding="utf-8")
+
+    def _load_progress(self) -> tuple[int, int, str | None]:
+        p = Path("progress.txt")
+        if not p.exists():
+            return (1, 0, None)
+        try:
+            parts = p.read_text(encoding="utf-8").strip().split(",", 2)
+            page = int(parts[0])
+            i = int(parts[1])
+            last_link = parts[2] if len(parts) == 3 and parts[2] != "" else None
+            return (page, i, last_link)
+        except Exception:
+            return (1, 0, None)

     def getvins(self, subdir: str, filename: str):
         """_summary_
@@ -330,11 +363,17 @@ class Scraper:
             subdir (str): _description_
             filename (str): _description_
         """
-        with open(filename, "a") as f:
+        start_page, start_i, last_link = self._load_progress()
+        print(f"__INFO__ Resuming at page={start_page}, index={start_i}, last_link={last_link}")
+        with open(filename, "a", encoding="utf-8") as f:
             cache: set[str] = set[str]()
-            page = 0
-            _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
+            if f.tell() == 0:
+                _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
+            page = start_page - 1
             while True:
                 page += 1
                 products_list = self._geturlproductslist(f"{subdir}?page={page}")
@@ -343,24 +382,40 @@ class Scraper:
                     break
                 products_list_length = len(products_list)
-                for i, product in enumerate(products_list):
+                start_at = start_i if page == start_page else 0
+                for i in range(start_at, products_list_length):
+                    product = products_list[i]
                     if not isinstance(product, dict):
                         continue
                     link = product.get("seoKeyword")
-                    if link and link not in cache:
-                        try:
-                            infos = self.getjsondata(link).informations()
-                            _ = f.write(infos + "\n")
-                            print(
-                                f"page: {page} | {i + 1}/{products_list_length} {link}"
-                            )
-                            cache.add(link)
-                        except (JSONDecodeError, HTTPError) as e:
-                            print(f"Error on product {link}: {e}")
+                    if not link:
+                        continue
+                    # to avoid duplicates:
+                    if (page == start_page) and (last_link is not None) and (link == last_link):
+                        self._save_progress(page, i + 1, link)
+                        continue
+                    self._save_progress(page, i + 1, link)
+                    if link in cache:
+                        continue
+                    try:
+                        infos = self.getjsondata(link).informations()
+                        _ = f.write(infos + "\n")
+                        print(f"page: {page} | {i + 1}/{products_list_length} {link}")
+                        cache.add(link)
+                    except (JSONDecodeError, HTTPError) as e:
+                        print(f"Error on product {link}: {e}")
                 f.flush()
+        Path("progress.txt").unlink(missing_ok=True)

 def main() -> None:
     if len(argv) != 2:
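Taken together, the commit makes getvins restartable: it loads the checkpoint, skips already-processed items on the saved page, writes the checkpoint after each product, and deletes it only after a clean run. A hypothetical resume scenario, assuming a Scraper constructor, base URL, subdir, and output file that are not shown in this diff:

scraper = Scraper("https://shop.example/")   # hypothetical base URL and constructor
try:
    scraper.getvins("vins", "vins.csv")      # first run, interrupted partway through
except KeyboardInterrupt:
    pass                                     # progress.txt now holds "page,index,last_link"
scraper.getvins("vins", "vins.csv")          # a second call resumes from the checkpoint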