mirror of
https://github.com/guezoloic/millesima_projetS6.git
synced 2026-03-28 19:13:42 +00:00
ajout de la reprise automatique du scraping dans getvins
This commit is contained in:
115
scraper.py
115
scraper.py
@@ -3,9 +3,12 @@
|
|||||||
from sys import argv
|
from sys import argv
|
||||||
from typing import cast
|
from typing import cast
|
||||||
from requests import HTTPError, Response, Session
|
from requests import HTTPError, Response, Session
|
||||||
|
from requests.exceptions import Timeout, ConnectionError
|
||||||
|
import time
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from json import JSONDecodeError, loads
|
from json import JSONDecodeError, loads
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
class _ScraperData:
|
class _ScraperData:
|
||||||
@@ -171,6 +174,12 @@ class Scraper:
|
|||||||
# Très utile pour éviter de renvoyer toujours les mêmes handshake
|
# Très utile pour éviter de renvoyer toujours les mêmes handshake
|
||||||
# TCP et d'avoir toujours une connexion constante avec le server
|
# TCP et d'avoir toujours une connexion constante avec le server
|
||||||
self._session: Session = Session()
|
self._session: Session = Session()
|
||||||
|
self._session.headers.update({
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/122.0.0.0 Safari/537.36",
|
||||||
|
"Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
|
||||||
|
})
|
||||||
# Système de cache pour éviter de solliciter le serveur inutilement
|
# Système de cache pour éviter de solliciter le serveur inutilement
|
||||||
self._latest_request: tuple[(str, Response)] | None = None
|
self._latest_request: tuple[(str, Response)] | None = None
|
||||||
self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
|
self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
|
||||||
@@ -191,12 +200,20 @@ class Scraper:
|
|||||||
HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx).
|
HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx).
|
||||||
"""
|
"""
|
||||||
target_url: str = self._url + subdir.lstrip("/")
|
target_url: str = self._url + subdir.lstrip("/")
|
||||||
print(f"[DEBUG] GET {target_url}")
|
|
||||||
response: Response = self._session.get(url=target_url, timeout=10)
|
last_exc: Exception | None = None
|
||||||
print(f"[DEBUG] status={response.status_code} len={len(response.text)}")
|
for attempt in range(1, 4):
|
||||||
print(f"[DEBUG] head={response.text[:120].replace('\\n',' ')}")
|
try:
|
||||||
response.raise_for_status()
|
response: Response = self._session.get(url=target_url, timeout=30)
|
||||||
return response
|
response.raise_for_status()
|
||||||
|
return response
|
||||||
|
except (Timeout, ConnectionError) as e:
|
||||||
|
last_exc = e
|
||||||
|
print(f"Timeout/ConnectionError ({attempt}/3) sur {target_url}: {e}")
|
||||||
|
time.sleep(2 * attempt) # 2s, 4s, 6s
|
||||||
|
|
||||||
|
# après 3 essais, on abandonne
|
||||||
|
raise last_exc if last_exc else RuntimeError("Request failed")
|
||||||
|
|
||||||
def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response:
|
def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response:
|
||||||
"""
|
"""
|
||||||
@@ -307,22 +324,38 @@ class Scraper:
|
|||||||
data: dict[str, object] = self.getjsondata(subdir).getdata()
|
data: dict[str, object] = self.getjsondata(subdir).getdata()
|
||||||
|
|
||||||
for element in ["initialReduxState", "categ", "content"]:
|
for element in ["initialReduxState", "categ", "content"]:
|
||||||
nxt = data.get(element)
|
data: dict[str, object] = cast(dict[str, object], data.get(element))
|
||||||
print("DEBUG key", element, "->", type(nxt))
|
if not isinstance(data, dict):
|
||||||
if not isinstance(nxt, dict):
|
|
||||||
print("DEBUG structure manquante, stop sur", element)
|
|
||||||
return None
|
return None
|
||||||
data = nxt
|
|
||||||
|
|
||||||
products = data.get("products")
|
products: list[str] = cast(list[str], data.get("products"))
|
||||||
print("DEBUG products type:", type(products), "len:", 0 if not isinstance(products, list) else len(products))
|
|
||||||
if isinstance(products, list):
|
if isinstance(products, list):
|
||||||
return products
|
return products
|
||||||
|
|
||||||
except (JSONDecodeError, HTTPError) as e:
|
except (JSONDecodeError, HTTPError):
|
||||||
print(f"DEBUG HTTP/JSON error sur {subdir}: {type(e).__name__} {e}")
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _save_progress(self, page: int, i: int, last_link: str) -> None:
    """Persist the scraping checkpoint to ``progress.txt`` in the working directory.

    The record is a single line ``page,i,last_link`` encoded as UTF-8.

    The data is first written to a sibling temporary file and then moved
    over the real file, so an interruption in the middle of the write can
    never leave a truncated checkpoint behind.

    Args:
        page: Page number to store.
        i: Product index to store.
        last_link: Link of the last product handled.
    """
    target = Path("progress.txt")
    tmp = target.with_name(target.name + ".tmp")
    tmp.write_text(f"{page},{i},{last_link}", encoding="utf-8")
    # Path.replace overwrites the destination in a single rename step.
    tmp.replace(target)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_progress(self) -> tuple[int, int, str | None]:
|
||||||
|
p = Path("progress.txt")
|
||||||
|
if not p.exists():
|
||||||
|
return (1, 0, None)
|
||||||
|
|
||||||
|
try:
|
||||||
|
parts = p.read_text(encoding="utf-8").strip().split(",", 2)
|
||||||
|
|
||||||
|
page = int(parts[0])
|
||||||
|
i = int(parts[1])
|
||||||
|
|
||||||
|
last_link = parts[2] if len(parts) == 3 and parts[2] != "" else None
|
||||||
|
return (page, i, last_link)
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
return (1, 0, None)
|
||||||
|
|
||||||
def getvins(self, subdir: str, filename: str):
|
def getvins(self, subdir: str, filename: str):
|
||||||
"""_summary_
|
"""_summary_
|
||||||
|
|
||||||
@@ -330,11 +363,17 @@ class Scraper:
|
|||||||
subdir (str): _description_
|
subdir (str): _description_
|
||||||
filename (str): _description_
|
filename (str): _description_
|
||||||
"""
|
"""
|
||||||
with open(filename, "a") as f:
|
start_page, start_i, last_link = self._load_progress()
|
||||||
|
print(f"__INFO__ Reprise à page={start_page}, index={start_i}, last_link={last_link}")
|
||||||
|
|
||||||
|
with open(filename, "a", encoding="utf-8") as f:
|
||||||
cache: set[str] = set[str]()
|
cache: set[str] = set[str]()
|
||||||
page = 0
|
|
||||||
_ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
|
if f.tell() == 0:
|
||||||
|
_ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
|
||||||
|
|
||||||
|
page = start_page - 1
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
page += 1
|
page += 1
|
||||||
products_list = self._geturlproductslist(f"{subdir}?page={page}")
|
products_list = self._geturlproductslist(f"{subdir}?page={page}")
|
||||||
@@ -343,24 +382,40 @@ class Scraper:
|
|||||||
break
|
break
|
||||||
|
|
||||||
products_list_length = len(products_list)
|
products_list_length = len(products_list)
|
||||||
for i, product in enumerate(products_list):
|
start_at = start_i if page == start_page else 0
|
||||||
|
|
||||||
|
for i in range(start_at, products_list_length):
|
||||||
|
product = products_list[i]
|
||||||
if not isinstance(product, dict):
|
if not isinstance(product, dict):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
link = product.get("seoKeyword")
|
link = product.get("seoKeyword")
|
||||||
|
if not link:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# pour eviter les doublons :
|
||||||
|
if (page == start_page) and (last_link is not None) and (link == last_link):
|
||||||
|
self._save_progress(page, + 1, link)
|
||||||
|
continue
|
||||||
|
|
||||||
|
self._save_progress(page, i + 1, link)
|
||||||
|
|
||||||
|
if link in cache:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
infos = self.getjsondata(link).informations()
|
||||||
|
_ = f.write(infos + "\n")
|
||||||
|
print(f"page: {page} | {i + 1}/{products_list_length} {link}")
|
||||||
|
cache.add(link)
|
||||||
|
|
||||||
|
except (JSONDecodeError, HTTPError) as e:
|
||||||
|
print(f"Erreur sur le produit {link}: {e}")
|
||||||
|
|
||||||
if link and link not in cache:
|
|
||||||
try:
|
|
||||||
infos = self.getjsondata(link).informations()
|
|
||||||
_ = f.write(infos + "\n")
|
|
||||||
print(
|
|
||||||
f"page: {page} | {i + 1}/{products_list_length} {link}"
|
|
||||||
)
|
|
||||||
cache.add(link)
|
|
||||||
except (JSONDecodeError, HTTPError) as e:
|
|
||||||
print(f"Erreur sur le produit {link}: {e}")
|
|
||||||
f.flush()
|
f.flush()
|
||||||
|
|
||||||
|
Path("progress.txt").unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
if len(argv) != 2:
|
if len(argv) != 2:
|
||||||
|
|||||||
Reference in New Issue
Block a user