mirror of
https://github.com/guezoloic/millesima_projetS6.git
synced 2026-03-29 03:23:47 +00:00
Compare commits
10 Commits
exo7-loic
...
0f6eb856c6
| Author | SHA1 | Date | |
|---|---|---|---|
| 0f6eb856c6 | |||
| d62145e250 | |||
| 829c303e78 | |||
| b584f9a301 | |||
| 547c7ec4c1 | |||
| 0aa765d6a0 | |||
| 8a357abe86 | |||
|
|
2f5af5aabf | ||
|
|
0182bbbf20 | ||
|
|
8cae082344 |
15
pyproject.toml
Normal file
15
pyproject.toml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
[project]
|
||||||
|
name = "projet-millesima-s6"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"requests==2.32.5",
|
||||||
|
"beautifulsoup4==4.14.3",
|
||||||
|
"pandas==2.3.3",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
test = ["pytest==8.4.2", "requests-mock==1.12.1"]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["setuptools"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
requests>=2.32.5
|
|
||||||
requests-mock>=1.12.1
|
|
||||||
beautifulsoup4>=4.14.3
|
|
||||||
Binary file not shown.
20
src/main.py
Executable file
20
src/main.py
Executable file
@@ -0,0 +1,20 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from os import getcwd
|
||||||
|
from os.path import normpath, join
|
||||||
|
from sys import argv
|
||||||
|
from pandas import read_csv, DataFrame
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
if len(argv) != 2:
|
||||||
|
raise ValueError(f"{argv[0]} <filename.csv>")
|
||||||
|
|
||||||
|
path: str = normpath(join(getcwd(), argv[1]))
|
||||||
|
db: DataFrame = read_csv(path)
|
||||||
|
print(db.all())
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
try:
|
||||||
|
main()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"ERREUR: {e}")
|
||||||
46
main.py → src/scraper.py
Normal file → Executable file
46
main.py → src/scraper.py
Normal file → Executable file
@@ -1,3 +1,6 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from sys import argv
|
||||||
from typing import cast
|
from typing import cast
|
||||||
from requests import HTTPError, Response, Session
|
from requests import HTTPError, Response, Session
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
@@ -6,8 +9,8 @@ from json import JSONDecodeError, loads
|
|||||||
|
|
||||||
|
|
||||||
class _ScraperData:
|
class _ScraperData:
|
||||||
"""_summary_
|
"""_summary_"""
|
||||||
"""
|
|
||||||
def __init__(self, data: dict[str, object]) -> None:
|
def __init__(self, data: dict[str, object]) -> None:
|
||||||
"""_summary_
|
"""_summary_
|
||||||
|
|
||||||
@@ -121,7 +124,7 @@ class _ScraperData:
|
|||||||
|
|
||||||
val = cast(str, app_dict.get("value")).rstrip("+").split("-")
|
val = cast(str, app_dict.get("value")).rstrip("+").split("-")
|
||||||
if len(val) > 1 and val[1] != "":
|
if len(val) > 1 and val[1] != "":
|
||||||
val[0] = str((int(val[0]) + int(val[1])) / 2)
|
val[0] = str(round((float(val[0]) + float(val[1])) / 2, 1))
|
||||||
|
|
||||||
return val[0]
|
return val[0]
|
||||||
return None
|
return None
|
||||||
@@ -171,6 +174,18 @@ class Scraper:
|
|||||||
# Très utile pour éviter de renvoyer toujours les mêmes handshake
|
# Très utile pour éviter de renvoyer toujours les mêmes handshake
|
||||||
# TCP et d'avoir toujours une connexion constante avec le server
|
# TCP et d'avoir toujours une connexion constante avec le server
|
||||||
self._session: Session = Session()
|
self._session: Session = Session()
|
||||||
|
# Crée une "fausse carte d'identité" pour éviter que le site nous
|
||||||
|
# bloque car on serait des robots
|
||||||
|
self._session.headers.update(
|
||||||
|
{
|
||||||
|
"User-Agent":
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
|
||||||
|
AppleWebKit/537.36 (KHTML, like Gecko) \
|
||||||
|
Chrome/122.0.0.0 Safari/537.36",
|
||||||
|
"Accept-Language":
|
||||||
|
"fr-FR,fr;q=0.9,en;q=0.8",
|
||||||
|
}
|
||||||
|
)
|
||||||
# Système de cache pour éviter de solliciter le serveur inutilement
|
# Système de cache pour éviter de solliciter le serveur inutilement
|
||||||
self._latest_request: tuple[(str, Response)] | None = None
|
self._latest_request: tuple[(str, Response)] | None = None
|
||||||
self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
|
self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
|
||||||
@@ -191,7 +206,8 @@ class Scraper:
|
|||||||
HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx).
|
HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx).
|
||||||
"""
|
"""
|
||||||
target_url: str = self._url + subdir.lstrip("/")
|
target_url: str = self._url + subdir.lstrip("/")
|
||||||
response: Response = self._session.get(url=target_url, timeout=10)
|
# envoyer une requête GET sur la page si erreur, renvoie un raise
|
||||||
|
response: Response = self._session.get(url=target_url, timeout=30)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response
|
return response
|
||||||
|
|
||||||
@@ -291,7 +307,7 @@ class Scraper:
|
|||||||
|
|
||||||
return _ScraperData(cast(dict[str, object], current_data))
|
return _ScraperData(cast(dict[str, object], current_data))
|
||||||
|
|
||||||
def _geturlproductslist(self, subdir: str):
|
def _geturlproductslist(self, subdir: str) -> list[str] | None:
|
||||||
"""_summary_
|
"""_summary_
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -315,20 +331,22 @@ class Scraper:
|
|||||||
except (JSONDecodeError, HTTPError):
|
except (JSONDecodeError, HTTPError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def getvins(self, subdir: str, filename: str):
|
def getvins(self, subdir: str, filename: str) -> None:
|
||||||
"""_summary_
|
"""_summary_
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
subdir (str): _description_
|
subdir (str): _description_
|
||||||
filename (str): _description_
|
filename (str): _description_
|
||||||
"""
|
"""
|
||||||
with open(filename, "a") as f:
|
with open(filename, "w") as f:
|
||||||
cache: set[str] = set[str]()
|
cache: set[str] = set[str]()
|
||||||
page = 0
|
page = 0
|
||||||
|
_ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
page += 1
|
page += 1
|
||||||
products_list = self._geturlproductslist(f"{subdir}?page={page}")
|
products_list: list[str] | None = \
|
||||||
|
self._geturlproductslist(f"{subdir}?page={page}")
|
||||||
|
|
||||||
if not products_list:
|
if not products_list:
|
||||||
break
|
break
|
||||||
@@ -353,5 +371,15 @@ class Scraper:
|
|||||||
f.flush()
|
f.flush()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
if len(argv) != 2:
|
||||||
|
raise ValueError(f"{argv[0]} <sous-url>")
|
||||||
|
scraper: Scraper = Scraper()
|
||||||
|
scraper.getvins(argv[1], "donnee.csv")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
Scraper().getvins("bordeaux.html", "donnee.csv")
|
try:
|
||||||
|
main()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"ERREUR: {e}")
|
||||||
0
tests/test_main.py
Normal file
0
tests/test_main.py
Normal file
@@ -2,7 +2,7 @@ from json import dumps
|
|||||||
from unittest.mock import patch, mock_open
|
from unittest.mock import patch, mock_open
|
||||||
import pytest
|
import pytest
|
||||||
from requests_mock import Mocker
|
from requests_mock import Mocker
|
||||||
from main import Scraper
|
from scraper import Scraper
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
@@ -138,7 +138,7 @@ def mock_site():
|
|||||||
"isSpirit": False,
|
"isSpirit": False,
|
||||||
},
|
},
|
||||||
"note_js": {
|
"note_js": {
|
||||||
"valueId": "93-94",
|
"valueId": "93-94.5",
|
||||||
"name": "J. cherazade",
|
"name": "J. cherazade",
|
||||||
"value": "93-94",
|
"value": "93-94",
|
||||||
"isSpirit": False,
|
"isSpirit": False,
|
||||||
Reference in New Issue
Block a user