Merge pull request #11 from guezoloic/jalon2-loic

Jalon2 loic
This commit is contained in:
Loïc GUEZO
2026-03-06 21:09:52 +01:00
committed by GitHub
6 changed files with 235 additions and 39 deletions

View File

@@ -36,10 +36,3 @@ jobs:
- name: Test with pytest
run: pytest
- name: Deploy Doc
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
run: |
git config user.name github-actions
git config user.email github-actions@github.com
mkdocs gh-deploy --force

View File

@@ -1,7 +1,12 @@
[project]
name = "projet-millesima-s6"
version = "0.1.0"
dependencies = ["requests==2.32.5", "beautifulsoup4==4.14.3", "pandas==2.3.3", "tqdm==4.67.3"]
dependencies = [
"requests==2.32.5",
"beautifulsoup4==4.14.3",
"pandas==2.3.3",
"tqdm==4.67.3",
]
[project.optional-dependencies]
test = ["pytest==8.4.2", "requests-mock==1.12.1", "flake8==7.3.0"]

109
src/cleaning.py Executable file
View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
from os import getcwd
from os.path import normpath, join
from typing import cast
from pandas import DataFrame, read_csv, to_numeric, get_dummies
from sys import argv
def path_filename(filename: str) -> str:
return normpath(join(getcwd(), filename))
class Cleaning:
def __init__(self, filename) -> None:
self._vins: DataFrame = read_csv(filename)
#
self.SCORE_COLS: list[str] = [
c for c in self._vins.columns if c not in ["Appellation", "Prix"]
]
#
for col in self.SCORE_COLS:
self._vins[col] = to_numeric(self._vins[col], errors="coerce")
def getVins(self) -> DataFrame:
return self._vins.copy(deep=True)
def __str__(self) -> str:
"""
Affiche un résumé du DataFrame
- la taille
- types des colonnes
- valeurs manquantes
- statistiques numériques
"""
return (
f"Shape : {self._vins.shape[0]} lignes x {self._vins.shape[1]} colonnes\n\n"
f"Types des colonnes :\n{self._vins.dtypes}\n\n"
f"Valeurs manquantes :\n{self._vins.isna().sum()}\n\n"
f"Statistiques numériques :\n{self._vins.describe().round(2)}\n\n"
)
def drop_empty_appellation(self) -> "Cleaning":
self._vins = self._vins.dropna(subset=["Appellation"])
return self
def _mean_score(self, col: str) -> DataFrame:
"""
Calcule la moyenne d'une colonne de score par appellation.
- Convertit les valeurs en numériques, en remplaçant les non-convertibles par NaN
- Calcule la moyenne par appellation
- Remplace les NaN résultants par 0
"""
means = self._vins.groupby("Appellation", as_index=False)[col].mean()
means = means.rename(
columns={col: f"mean_{col}"}
) # pyright: ignore[reportCallIssue]
return cast(DataFrame, means.fillna(0))
def _mean_robert(self) -> DataFrame:
return self._mean_score("Robert")
def _mean_robinson(self) -> DataFrame:
return self._mean_score("Robinson")
def _mean_suckling(self) -> DataFrame:
return self._mean_score("Suckling")
def fill_missing_scores(self) -> "Cleaning":
"""
Remplacer les notes manquantes par la moyenne
des vins de la même appellation.
"""
for element in self.SCORE_COLS:
means = self._mean_score(element)
self._vins = self._vins.merge(means, on="Appellation", how="left")
mean_col = f"mean_{element}"
self._vins[element] = self._vins[element].fillna(self._vins[mean_col])
self._vins = self._vins.drop(columns=["mean_" + element])
return self
def encode_appellation(self, column: str = "Appellation") -> "Cleaning":
"""
Remplace la colonne 'Appellation' par des colonnes indicatrices
"""
appellations = self._vins[column].astype(str).str.strip()
appellation_dummies = get_dummies(appellations)
self._vins = self._vins.drop(columns=[column])
self._vins = self._vins.join(appellation_dummies)
return self
def main() -> None:
if len(argv) != 2:
raise ValueError(f"Usage: {argv[0]} <filename.csv>")
filename = argv[1]
cleaning: Cleaning = Cleaning(filename)
_ = cleaning.drop_empty_appellation().fill_missing_scores().encode_appellation()
if __name__ == "__main__":
try:
main()
except Exception as e:
print(f"ERREUR: {e}")

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
from collections import OrderedDict
from io import SEEK_END, SEEK_SET, BufferedWriter
from io import SEEK_END, SEEK_SET, BufferedWriter, TextIOWrapper
from json import JSONDecodeError, loads
from os import makedirs
from os.path import dirname, exists, join, normpath, realpath
@@ -215,6 +215,7 @@ class _ScraperData:
robinson = self.robinson()
suckling = self.suckling()
prix = self.prix()
prix = self.prix()
return f"{appellation},{parker},{robinson},{suckling},{prix}"
@@ -383,8 +384,7 @@ class Scraper:
list[dict[str, Any]], data.get("products")
)
if isinstance(products, list):
return products
return products
except (JSONDecodeError, HTTPError):
return None
@@ -407,6 +407,44 @@ class Scraper:
except (JSONDecodeError, HTTPError) as e:
print(f"Erreur sur le produit {link}: {e}")
def _initstate(self, reset: bool) -> tuple[int, set[str]]:
"""
appelle la fonction pour load le cache, si il existe
pas, il utilise les variables de base sinon il override
toute les variables pour continuer et pas recommencer le
processus en entier.
Args:
reset (bool): pouvoir le reset ou pas
Returns:
tuple[int, set[str]]: le contenu de la page et du cache
"""
if not reset:
#
serializable: tuple[int, set[str]] | None = loadstate()
if isinstance(serializable, tuple):
return serializable
return 1, set()
def _ensuretitle(self, f: TextIOWrapper, title: str) -> None:
"""
check si le titre est bien présent au début du buffer
sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le
temps a la fin du buffer, si on a ecrit des choses avant
le titre sera apres ces données mais on part du principe
que personne va toucher le fichier.
Args:
f (TextIOWrapper): buffer stream fichier
title (str): titre du csv
"""
_ = f.seek(0, SEEK_SET)
if not (f.read(len(title)) == title):
_ = f.write(title)
else:
_ = f.seek(0, SEEK_END)
def getvins(self, subdir: str, filename: str, reset: bool = False) -> None:
"""
Scrape toutes les pages d'une catégorie et sauvegarde en CSV.
@@ -420,35 +458,13 @@ class Scraper:
mode: Literal["w", "a+"] = "w" if reset else "a+"
# titre
title: str = "Appellation,Robert,Robinson,Suckling,Prix\n"
# page du début
page: int = 1
# le set qui sert de cache
cache: set[str] = set[str]()
# page: page où commence le scraper
# cache: tout les pages déjà parcourir
page, cache = self._initstate(reset)
custom_format = "{l_bar} {bar:20} {r_bar}"
if not reset:
# appelle la fonction pour load le cache, si il existe
# pas, il utilise les variables de base sinon il override
# toute les variables pour continuer et pas recommencer le
# processus en entier.
serializable: tuple[int, set[str]] | None = loadstate()
if isinstance(serializable, tuple):
# override la page et le cache
page, cache = serializable
try:
with open(filename, mode) as f:
# check si le titre est bien présent au début du buffer
# sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le
# temps a la fin du buffer, si on a ecrit des choses avant
# le titre sera apres ces données mais on part du principe
# que personne va toucher le fichier.
_ = f.seek(0, SEEK_SET)
if not (f.read(len(title)) == title):
_ = f.write(title)
else:
_ = f.seek(0, SEEK_END)
self._ensuretitle(f, title)
while True:
products_list: list[dict[str, Any]] | None = (
self._geturlproductslist(f"{subdir}?page={page}")
@@ -457,15 +473,21 @@ class Scraper:
break
pbar: tqdm[dict[str, Any]] = tqdm(
products_list, bar_format=custom_format
products_list, bar_format="{l_bar} {bar:20} {r_bar}"
)
for product in pbar:
keyword = product.get("seoKeyword", "Inconnu")[:40]
keyword: str = cast(
str, product.get("seoKeyword", "Inconnu")[:40]
)
pbar.set_description(
f"Page: {page:<3} | Product: {keyword:<40}"
)
self._writevins(cache, product, f)
page += 1
# va créer un fichier au début et l'override
# tout les 5 pages au cas où SIGHUP ou autre
if page % 5 == 0 and not reset:
savestate((page, cache))
except (Exception, HTTPError, KeyboardInterrupt, JSONDecodeError):
if not reset:
savestate((page, cache))

67
tests/test_cleaning.py Executable file
View File

@@ -0,0 +1,67 @@
import pytest
from unittest.mock import patch, mock_open
from cleaning import Cleaning
@pytest.fixture
def cleaning_raw() -> Cleaning:
"""
"Appellation": ["Pauillac", "Pauillac ", "Margaux", None , "Pomerol", "Pomerol"],
"Robert": ["95" , None , "bad" , 90 , None , None ],
"Robinson": [None , "93" , 18 , None , None , None ],
"Suckling": [96 , None , None , None , 91 , None ],
"Prix": ["10.0" , "11.0" , "20.0" , "30.0", "40.0" , "50.0" ],
"""
csv_content = """Appellation,Robert,Robinson,Suckling,Prix
Pauillac,95,,96,10.0
Pauillac ,,93,,11.0
Margaux,bad,18,,20.0
,90,,,30.0
Pomerol,,,91,40.0
Pomerol,,,,50.0
"""
m = mock_open(read_data=csv_content)
with patch("builtins.open", m):
return Cleaning("donnee.csv")
def test_drop_empty_appellation(cleaning_raw: Cleaning) -> None:
out = cleaning_raw.drop_empty_appellation().getVins()
assert out["Appellation"].isna().sum() == 0
assert len(out) == 5
def test_mean_score_zero_when_no_scores(cleaning_raw: Cleaning) -> None:
out = cleaning_raw.drop_empty_appellation()
m = out._mean_score("Robert")
assert list(m.columns) == ["Appellation", "mean_Robert"]
pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[
0
]
assert pomerol_mean == 0
def test_fill_missing_scores(cleaning_raw: Cleaning):
cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()
cleaning_raw.drop_empty_appellation()
filled = cleaning_raw.fill_missing_scores().getVins()
for col in cleaning_raw.SCORE_COLS:
assert filled[col].isna().sum() == 0
pauillac_robert = filled[filled["Appellation"] == "Pauillac"]["Robert"]
assert (pauillac_robert == 95.0).all()
def test_encode_appellation(cleaning_raw: Cleaning):
cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()
out = (
cleaning_raw.drop_empty_appellation()
.fill_missing_scores()
.encode_appellation()
.getVins()
)
assert "Appellation" not in out.columns
assert "Pauillac" in out.columns
assert int(out.loc[0, "Pauillac"]) == 1

0
tests/test_scraper.py Normal file → Executable file
View File