1 Commits

Author SHA1 Message Date
0f6eb856c6 ajout: restructuration des fichiers et modifications scraper 2026-03-01 19:39:57 +01:00
9 changed files with 74 additions and 325 deletions

View File

@@ -1,106 +0,0 @@
#!/usr/bin/env python3
from pandas import DataFrame, to_numeric
import pandas as pd
SCORE_COLS = ["Robert", "Robinson", "Suckling"]
def display_info(df: DataFrame, name: str = "DataFrame") -> None:
    """
    Print a summary of the DataFrame:
    - shape (rows × columns)
    - column dtypes
    - missing-value counts
    - numeric statistics (rounded to 2 decimals)
    """
    n_rows, n_cols = df.shape
    print(f"\n===== {name} =====")
    print(f"Shape : {n_rows} lignes × {n_cols} colonnes")
    # Each section: a header line followed by the pandas object's repr.
    for header, content in (
        ("\nTypes des colonnes :", df.dtypes),
        ("\nValeurs manquantes :", df.isna().sum()),
        ("\nStatistiques numériques :", df.describe().round(2)),
    ):
        print(header)
        print(content)
def drop_empty_appellation(df: DataFrame) -> DataFrame:
    """Return *df* without the rows whose "Appellation" value is missing."""
    keep = df["Appellation"].notna()
    return df[keep]
def mean_score(df: DataFrame, col: str) -> DataFrame:
    """
    Average one score column per appellation.

    - Non-numeric score entries are coerced to NaN before averaging.
    - Appellations with no usable score get a mean of 0.
    - Result columns: "Appellation" and f"mean_{col}".
    """
    numeric_scores = to_numeric(df[col], errors="coerce")
    per_appellation = (
        DataFrame({"Appellation": df["Appellation"], col: numeric_scores})
        .groupby("Appellation", as_index=False)[col]
        .mean()
    )
    per_appellation[col] = per_appellation[col].fillna(0)
    return per_appellation.rename(columns={col: f"mean_{col}"})
def mean_robert(df: DataFrame) -> DataFrame:
    """Mean of the "Robert" score per appellation (delegates to mean_score)."""
    return mean_score(df, "Robert")
def mean_robinson(df: DataFrame) -> DataFrame:
    """Mean of the "Robinson" score per appellation (delegates to mean_score)."""
    return mean_score(df, "Robinson")
def mean_suckling(df: DataFrame) -> DataFrame:
    """Mean of the "Suckling" score per appellation (delegates to mean_score)."""
    return mean_score(df, "Suckling")
def fill_missing_scores(df: DataFrame) -> DataFrame:
    """
    Replace each missing score with the mean score of wines from the
    same appellation (0 when the appellation has no score at all).
    """
    result = df.copy()
    # Normalise appellation labels so the merge keys line up.
    result["Appellation"] = result["Appellation"].astype(str).str.strip()
    for col in SCORE_COLS:
        result[col] = to_numeric(result[col], errors="coerce")

    # Merge in one mean_<col> helper column per score, fill, then drop them.
    helper_cols = [f"mean_{col}" for col in SCORE_COLS]
    for col, helper in zip(SCORE_COLS, helper_cols):
        result = result.merge(mean_score(result, col), on="Appellation", how="left")
        result[col] = result[col].fillna(result[helper])
    return result.drop(columns=helper_cols)
def encode_appellation(df: DataFrame, column: str = "Appellation") -> DataFrame:
    """
    Replace the appellation column with one indicator (dummy) column per
    distinct, whitespace-stripped appellation.
    """
    labels = df[column].astype(str).str.strip()
    indicators = pd.get_dummies(labels)
    return df.drop(columns=[column]).join(indicators)

64
main.py
View File

@@ -1,64 +0,0 @@
#!/usr/bin/env python3
from os import getcwd
from os.path import normpath, join
from sys import argv
from pandas import read_csv, DataFrame
from cleaning import (display_info,
drop_empty_appellation,
mean_robert,
mean_robinson,
mean_suckling,
fill_missing_scores,
encode_appellation)
def load_csv(filename: str) -> DataFrame:
    """Read a CSV file, resolving *filename* against the current working directory."""
    full_path: str = normpath(join(getcwd(), filename))
    return read_csv(full_path)
def save_csv(df: DataFrame, out_filename: str) -> None:
    """Write *df* to *out_filename* as CSV, omitting the row index."""
    df.to_csv(path_or_buf=out_filename, index=False)
def main() -> None:
    """
    Run the full cleaning pipeline on the CSV file given on the command line:
    drop rows with no appellation, export per-critic appellation means, fill
    missing scores with the appellation mean, then one-hot encode appellations.

    Raises:
        ValueError: when the argument count is wrong.
    """
    if len(argv) != 2:
        raise ValueError(f"Usage: {argv[0]} <filename.csv>")

    df = load_csv(argv[1])
    display_info(df, "Avant le nettoyage")

    df = drop_empty_appellation(df)
    save_csv(df, "donnee_clean.csv")
    display_info(df, "Après nettoyage d'appellations manquantes")

    # Mean wine score per appellation, one CSV per critic.
    for compute_means, critic in (
        (mean_robert, "Robert"),
        (mean_robinson, "Robinson"),
        (mean_suckling, "Suckling"),
    ):
        means = compute_means(df)
        save_csv(means, f"mean_{critic.lower()}_by_appellation.csv")
        display_info(means, f"Moyennes {critic} par appellation")

    filled = fill_missing_scores(df)
    save_csv(filled, "donnee_filled.csv")
    display_info(filled, "Après remplissage des notes manquantes par la moyenne de l'appellation")

    ready = encode_appellation(filled)
    save_csv(ready, "donnee_ready.csv")
    display_info(ready, "Après remplacer la colonne 'Appellation' par des colonnes indicatrices")
# Script entry point: run the pipeline and print a short error message
# instead of a traceback. NOTE(review): the broad `except Exception` is a
# deliberate top-level boundary, but it hides the traceback — consider
# logging it when debugging.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"ERREUR: {e}")

15
pyproject.toml Normal file
View File

@@ -0,0 +1,15 @@
[project]
name = "projet-millesima-s6"
version = "0.1.0"
dependencies = [
"requests==2.32.5",
"beautifulsoup4==4.14.3",
"pandas==2.3.3",
]
[project.optional-dependencies]
test = ["pytest==8.4.2", "requests-mock==1.12.1"]
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

View File

@@ -1,6 +0,0 @@
requests==2.32.5
requests-mock==1.12.1
beautifulsoup4==4.14.3
pytest==8.4.2
pandas==2.3.3

20
src/main.py Executable file
View File

@@ -0,0 +1,20 @@
#!/usr/bin/env python3
from os import getcwd
from os.path import normpath, join
from sys import argv
from pandas import read_csv, DataFrame
def main() -> None:
    """
    Load the CSV file given as the only command-line argument and print it.

    Raises:
        ValueError: when the argument count is wrong.
    """
    if len(argv) != 2:
        # Usage message made consistent with the other main() in this project.
        raise ValueError(f"Usage: {argv[0]} <filename.csv>")
    path: str = normpath(join(getcwd(), argv[1]))
    db: DataFrame = read_csv(path)
    # BUG FIX: `print(db.all())` reduced every column to a single boolean
    # (pandas DataFrame.all) instead of showing the data; print the frame.
    print(db)
# Script entry point: report failures as a short message rather than a
# traceback. NOTE(review): the broad `except Exception` is a deliberate
# top-level boundary; it hides tracebacks during debugging.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"ERREUR: {e}")

View File

@@ -3,12 +3,9 @@
from sys import argv from sys import argv
from typing import cast from typing import cast
from requests import HTTPError, Response, Session from requests import HTTPError, Response, Session
from requests.exceptions import Timeout, ConnectionError
import time
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from collections import OrderedDict from collections import OrderedDict
from json import JSONDecodeError, loads from json import JSONDecodeError, loads
from pathlib import Path
class _ScraperData: class _ScraperData:
@@ -154,7 +151,10 @@ class _ScraperData:
parker = self.parker() parker = self.parker()
robinson = self.robinson() robinson = self.robinson()
suckling = self.suckling() suckling = self.suckling()
prix = self.prix() try:
prix = self.prix()
except ValueError:
prix = None
return f"{appellation},{parker},{robinson},{suckling},{prix}" return f"{appellation},{parker},{robinson},{suckling},{prix}"
@@ -174,12 +174,18 @@ class Scraper:
# Très utile pour éviter de renvoyer toujours les mêmes handshake # Très utile pour éviter de renvoyer toujours les mêmes handshake
# TCP et d'avoir toujours une connexion constante avec le server # TCP et d'avoir toujours une connexion constante avec le server
self._session: Session = Session() self._session: Session = Session()
self._session.headers.update({ # Crée une "fausse carte d'identité" pour éviter que le site nous
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " # bloque car on serait des robots
"AppleWebKit/537.36 (KHTML, like Gecko) " self._session.headers.update(
"Chrome/122.0.0.0 Safari/537.36", {
"Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8", "User-Agent":
}) "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/122.0.0.0 Safari/537.36",
"Accept-Language":
"fr-FR,fr;q=0.9,en;q=0.8",
}
)
# Système de cache pour éviter de solliciter le serveur inutilement # Système de cache pour éviter de solliciter le serveur inutilement
self._latest_request: tuple[(str, Response)] | None = None self._latest_request: tuple[(str, Response)] | None = None
self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[ self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
@@ -200,20 +206,10 @@ class Scraper:
HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx). HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx).
""" """
target_url: str = self._url + subdir.lstrip("/") target_url: str = self._url + subdir.lstrip("/")
# envoyer une requête GET sur la page si erreur, renvoie un raise
last_exc: Exception | None = None response: Response = self._session.get(url=target_url, timeout=30)
for attempt in range(1, 4): response.raise_for_status()
try: return response
response: Response = self._session.get(url=target_url, timeout=30)
response.raise_for_status()
return response
except (Timeout, ConnectionError) as e:
last_exc = e
print(f"Timeout/ConnectionError ({attempt}/3) sur {target_url}: {e}")
time.sleep(2 * attempt) # 2s, 4s, 6s
# après 3 essais, on abandonne
raise last_exc if last_exc else RuntimeError("Request failed")
def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response: def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response:
""" """
@@ -311,7 +307,7 @@ class Scraper:
return _ScraperData(cast(dict[str, object], current_data)) return _ScraperData(cast(dict[str, object], current_data))
def _geturlproductslist(self, subdir: str): def _geturlproductslist(self, subdir: str) -> list[str] | None:
"""_summary_ """_summary_
Args: Args:
@@ -335,87 +331,45 @@ class Scraper:
except (JSONDecodeError, HTTPError): except (JSONDecodeError, HTTPError):
return None return None
def _save_progress(self, page: int, i: int, last_link: str) -> None: def getvins(self, subdir: str, filename: str) -> None:
Path("progress.txt").write_text(f"{page},{i},{last_link}", encoding="utf-8")
def _load_progress(self) -> tuple[int, int, str | None]:
p = Path("progress.txt")
if not p.exists():
return (1, 0, None)
try:
parts = p.read_text(encoding="utf-8").strip().split(",", 2)
page = int(parts[0])
i = int(parts[1])
last_link = parts[2] if len(parts) == 3 and parts[2] != "" else None
return (page, i, last_link)
except Exception:
return (1, 0, None)
def getvins(self, subdir: str, filename: str):
"""_summary_ """_summary_
Args: Args:
subdir (str): _description_ subdir (str): _description_
filename (str): _description_ filename (str): _description_
""" """
start_page, start_i, last_link = self._load_progress() with open(filename, "w") as f:
print(f"__INFO__ Reprise à page={start_page}, index={start_i}, last_link={last_link}")
with open(filename, "a", encoding="utf-8") as f:
cache: set[str] = set[str]() cache: set[str] = set[str]()
page = 0
if f.tell() == 0: _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
_ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
page = start_page - 1
while True: while True:
page += 1 page += 1
products_list = self._geturlproductslist(f"{subdir}?page={page}") products_list: list[str] | None = \
self._geturlproductslist(f"{subdir}?page={page}")
if not products_list: if not products_list:
break break
products_list_length = len(products_list) products_list_length = len(products_list)
start_at = start_i if page == start_page else 0 for i, product in enumerate(products_list):
for i in range(start_at, products_list_length):
product = products_list[i]
if not isinstance(product, dict): if not isinstance(product, dict):
continue continue
link = product.get("seoKeyword") link = product.get("seoKeyword")
if not link:
continue
# pour eviter les doublons :
if (page == start_page) and (last_link is not None) and (link == last_link):
self._save_progress(page, + 1, link)
continue
self._save_progress(page, i + 1, link)
if link in cache:
continue
try:
infos = self.getjsondata(link).informations()
_ = f.write(infos + "\n")
print(f"page: {page} | {i + 1}/{products_list_length} {link}")
cache.add(link)
except (JSONDecodeError, HTTPError) as e:
print(f"Erreur sur le produit {link}: {e}")
if link and link not in cache:
try:
infos = self.getjsondata(link).informations()
_ = f.write(infos + "\n")
print(
f"page: {page} | {i + 1}/{products_list_length} {link}"
)
cache.add(link)
except (JSONDecodeError, HTTPError) as e:
print(f"Erreur sur le produit {link}: {e}")
f.flush() f.flush()
Path("progress.txt").unlink(missing_ok=True)
def main() -> None: def main() -> None:
if len(argv) != 2: if len(argv) != 2:

View File

@@ -1,64 +0,0 @@
import pandas as pd
import pytest
from pandas import DataFrame
from cleaning import (
SCORE_COLS,
drop_empty_appellation,
mean_score,
fill_missing_scores,
encode_appellation,
)
@pytest.fixture
def df_raw() -> DataFrame:
    """Raw sample data: stray whitespace, a missing appellation, non-numeric scores."""
    data = {
        "Appellation": ["Pauillac", "Pauillac ", "Margaux", None, "Pomerol", "Pomerol"],
        "Robert": ["95", None, "bad", 90, None, None],
        "Robinson": [None, "93", 18, None, None, None],
        "Suckling": [96, None, None, None, 91, None],
        "Prix": ["10.0", "11.0", "20.0", "30.0", "40.0", "50.0"],
    }
    return pd.DataFrame(data)
def test_drop_empty_appellation(df_raw: DataFrame):
    """Rows with a missing appellation are removed, the rest are kept."""
    cleaned = drop_empty_appellation(df_raw)
    assert not cleaned["Appellation"].isna().any()
    assert cleaned.shape[0] == 5
def test_mean_score_zero_when_no_scores(df_raw: DataFrame):
    """An appellation with no usable score must average to 0."""
    means = mean_score(drop_empty_appellation(df_raw), "Robert")
    assert list(means.columns) == ["Appellation", "mean_Robert"]
    # Pomerol has no Robert score at all => its mean must be 0.
    pomerol = means[means["Appellation"].str.strip() == "Pomerol"]
    assert pomerol["mean_Robert"].iloc[0] == 0
def test_fill_missing_scores(df_raw: DataFrame):
    """After filling, score columns hold no NaN and no helper column remains."""
    filled = fill_missing_scores(drop_empty_appellation(df_raw))
    for col in SCORE_COLS:
        assert not filled[col].isna().any()
        assert f"mean_{col}" not in filled.columns
    # The second Pauillac row inherits the appellation mean (95).
    assert filled.loc[1, "Robert"] == 95.0
def test_encode_appellation(df_raw: DataFrame):
    """The text column is replaced by per-appellation indicator columns."""
    filled = fill_missing_scores(drop_empty_appellation(df_raw))
    encoded = encode_appellation(filled)
    assert "Appellation" not in encoded.columns
    assert "Pauillac" in encoded.columns
    assert encoded.loc[0, "Pauillac"] == 1