4 Commits

Author SHA1 Message Date
Chahrazad650
5afb6e38fe ajout : moyennes des notes par appellation 2026-02-26 21:11:43 +01:00
Chahrazad650
f31de22693 Q9 suppression des lignes sans appellation 2026-02-25 03:49:36 +01:00
Chahrazad650
73c6221080 ajout de la reprise automatique du scraping dans getvins 2026-02-25 02:48:55 +01:00
Chahrazad650
99dd71989d debuger _geturlproductslist et request -erreur 403 2026-02-25 00:10:00 +01:00
3 changed files with 174 additions and 26 deletions

48
cleaning.py Normal file
View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
from pandas import DataFrame, to_numeric
def display_info(df: DataFrame) -> None:
    """Print a quick overview of the frame.

    Emits, in order: the per-column ``all()`` reduction, the schema summary
    from ``DataFrame.info()`` (which prints itself and returns ``None``),
    and the count of missing values per column.
    """
    print(df.all())
    # DataFrame.info() writes its report to stdout itself; print() then
    # also emits the trailing "None" it returns, mirroring the original.
    print(df.info())
    missing_header = "\nNombre de valeurs manquantes par colonne :"
    print(missing_header)
    print(df.isna().sum())
def drop_empty_appellation(df: DataFrame) -> DataFrame:
    """Return the rows of ``df`` whose "Appellation" value is present.

    Rows with a missing (NaN/None) appellation are filtered out; the
    original frame is left untouched.
    """
    has_appellation = df["Appellation"].notna()
    return df[has_appellation]
def mean_score(df: DataFrame, col: str) -> DataFrame:
    """Average a score column per appellation.

    Values that cannot be parsed as numbers are coerced to NaN before
    averaging; groups whose scores are all missing end up with a mean of 0.
    The score column of the result is renamed to ``mean_<col>``.

    Args:
        df: frame holding at least "Appellation" and ``col`` columns.
        col: name of the score column to average.

    Returns:
        A frame with columns ``["Appellation", f"mean_{col}"]``.
    """
    numeric_scores = to_numeric(df[col], errors="coerce")
    working = DataFrame({"Appellation": df["Appellation"], col: numeric_scores})
    # groupby().mean() ignores NaN within a group; an all-NaN group yields
    # NaN, which fillna(0) turns into 0.
    per_appellation = working.groupby("Appellation", as_index=False)[col].mean()
    per_appellation[col] = per_appellation[col].fillna(0)
    return per_appellation.rename(columns={col: f"mean_{col}"})
def mean_robert(df: DataFrame) -> DataFrame:
    """Mean of the "Robert" score per appellation (delegates to mean_score)."""
    return mean_score(df, "Robert")
def mean_robinson(df: DataFrame) -> DataFrame:
    """Mean of the "Robinson" score per appellation (delegates to mean_score)."""
    return mean_score(df, "Robinson")
def mean_suckling(df: DataFrame) -> DataFrame:
    """Mean of the "Suckling" score per appellation (delegates to mean_score)."""
    return mean_score(df, "Suckling")

48
main.py
View File

@@ -5,13 +5,53 @@ from os.path import normpath, join
from sys import argv from sys import argv
from pandas import read_csv, DataFrame from pandas import read_csv, DataFrame
from cleaning import (display_info,
drop_empty_appellation,
mean_robert,
mean_robinson,
mean_suckling)
def load_csv(filename: str) -> DataFrame:
    """Read ``filename``, resolved against the current working directory,
    into a DataFrame."""
    working_dir = getcwd()
    csv_path = normpath(join(working_dir, filename))
    frame = read_csv(csv_path)
    return frame
def save_csv(df: DataFrame, out_filename: str) -> None:
    """Write ``df`` to ``out_filename`` as CSV, omitting the index column."""
    df.to_csv(path_or_buf=out_filename, index=False)
def main() -> None: def main() -> None:
if len(argv) != 2: if len(argv) != 2:
raise ValueError(f"{argv[0]} <filename.csv>") raise ValueError(f"Usage: {argv[0]} <filename.csv>")
df = load_csv(argv[1])
print("=== Avant nettoyage ===")
display_info(df)
df = drop_empty_appellation(df)
save_csv(df, "donnee_clean.csv")
print("\n=== Après nettoyage d'appellations manquantes ===")
display_info(df)
#la moyenne des notes des vins pour chaque appellation
robert_means = mean_robert(df)
save_csv(robert_means, "mean_robert_by_appellation.csv")
print("\n=== moyenne Robert par appellation ===")
print(robert_means.head(10))
robinson_means = mean_robinson(df)
save_csv(robinson_means, "mean_robinson_by_appellation.csv")
print("\n===: moyennes Robinson par appellation ===")
print(robinson_means.head(10))
suckling_means = mean_suckling(df)
save_csv(suckling_means, "mean_suckling_by_appellation.csv")
print("\n===: moyennes Suckling par appellation ===")
print(suckling_means.head(10))
path: str = normpath(join(getcwd(), argv[1]))
db: DataFrame = read_csv(path)
print(db.all())
if __name__ == "__main__": if __name__ == "__main__":
try: try:

View File

@@ -3,9 +3,12 @@
from sys import argv from sys import argv
from typing import cast from typing import cast
from requests import HTTPError, Response, Session from requests import HTTPError, Response, Session
from requests.exceptions import Timeout, ConnectionError
import time
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from collections import OrderedDict from collections import OrderedDict
from json import JSONDecodeError, loads from json import JSONDecodeError, loads
from pathlib import Path
class _ScraperData: class _ScraperData:
@@ -151,10 +154,7 @@ class _ScraperData:
parker = self.parker() parker = self.parker()
robinson = self.robinson() robinson = self.robinson()
suckling = self.suckling() suckling = self.suckling()
try: prix = self.prix()
prix = self.prix()
except ValueError:
prix = None
return f"{appellation},{parker},{robinson},{suckling},{prix}" return f"{appellation},{parker},{robinson},{suckling},{prix}"
@@ -174,6 +174,12 @@ class Scraper:
# Très utile pour éviter de renvoyer toujours les mêmes handshake # Très utile pour éviter de renvoyer toujours les mêmes handshake
# TCP et d'avoir toujours une connexion constante avec le server # TCP et d'avoir toujours une connexion constante avec le server
self._session: Session = Session() self._session: Session = Session()
self._session.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/122.0.0.0 Safari/537.36",
"Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
})
# Système de cache pour éviter de solliciter le serveur inutilement # Système de cache pour éviter de solliciter le serveur inutilement
self._latest_request: tuple[(str, Response)] | None = None self._latest_request: tuple[(str, Response)] | None = None
self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[ self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
@@ -194,9 +200,20 @@ class Scraper:
HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx). HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx).
""" """
target_url: str = self._url + subdir.lstrip("/") target_url: str = self._url + subdir.lstrip("/")
response: Response = self._session.get(url=target_url, timeout=10)
response.raise_for_status() last_exc: Exception | None = None
return response for attempt in range(1, 4):
try:
response: Response = self._session.get(url=target_url, timeout=30)
response.raise_for_status()
return response
except (Timeout, ConnectionError) as e:
last_exc = e
print(f"Timeout/ConnectionError ({attempt}/3) sur {target_url}: {e}")
time.sleep(2 * attempt) # 2s, 4s, 6s
# après 3 essais, on abandonne
raise last_exc if last_exc else RuntimeError("Request failed")
def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response: def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response:
""" """
@@ -318,6 +335,27 @@ class Scraper:
except (JSONDecodeError, HTTPError): except (JSONDecodeError, HTTPError):
return None return None
def _save_progress(self, page: int, i: int, last_link: str) -> None:
Path("progress.txt").write_text(f"{page},{i},{last_link}", encoding="utf-8")
def _load_progress(self) -> tuple[int, int, str | None]:
p = Path("progress.txt")
if not p.exists():
return (1, 0, None)
try:
parts = p.read_text(encoding="utf-8").strip().split(",", 2)
page = int(parts[0])
i = int(parts[1])
last_link = parts[2] if len(parts) == 3 and parts[2] != "" else None
return (page, i, last_link)
except Exception:
return (1, 0, None)
def getvins(self, subdir: str, filename: str): def getvins(self, subdir: str, filename: str):
"""_summary_ """_summary_
@@ -325,10 +363,16 @@ class Scraper:
subdir (str): _description_ subdir (str): _description_
filename (str): _description_ filename (str): _description_
""" """
with open(filename, "a") as f: start_page, start_i, last_link = self._load_progress()
print(f"__INFO__ Reprise à page={start_page}, index={start_i}, last_link={last_link}")
with open(filename, "a", encoding="utf-8") as f:
cache: set[str] = set[str]() cache: set[str] = set[str]()
page = 0
_ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n") if f.tell() == 0:
_ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
page = start_page - 1
while True: while True:
page += 1 page += 1
@@ -338,24 +382,40 @@ class Scraper:
break break
products_list_length = len(products_list) products_list_length = len(products_list)
for i, product in enumerate(products_list): start_at = start_i if page == start_page else 0
for i in range(start_at, products_list_length):
product = products_list[i]
if not isinstance(product, dict): if not isinstance(product, dict):
continue continue
link = product.get("seoKeyword") link = product.get("seoKeyword")
if not link:
continue
# pour eviter les doublons :
if (page == start_page) and (last_link is not None) and (link == last_link):
self._save_progress(page, + 1, link)
continue
self._save_progress(page, i + 1, link)
if link in cache:
continue
try:
infos = self.getjsondata(link).informations()
_ = f.write(infos + "\n")
print(f"page: {page} | {i + 1}/{products_list_length} {link}")
cache.add(link)
except (JSONDecodeError, HTTPError) as e:
print(f"Erreur sur le produit {link}: {e}")
if link and link not in cache:
try:
infos = self.getjsondata(link).informations()
_ = f.write(infos + "\n")
print(
f"page: {page} | {i + 1}/{products_list_length} {link}"
)
cache.add(link)
except (JSONDecodeError, HTTPError) as e:
print(f"Erreur sur le produit {link}: {e}")
f.flush() f.flush()
Path("progress.txt").unlink(missing_ok=True)
def main() -> None: def main() -> None:
if len(argv) != 2: if len(argv) != 2: