Files
archived-millesima-projetS6/scraper/index.html
2026-03-04 11:53:20 +00:00

1871 lines
98 KiB
HTML

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<!-- Document metadata for the "Scraper" page; the generator tag below identifies the MkDocs / Material versions that produced it. -->
<meta name="viewport" content="width=device-width,initial-scale=1">
<!-- Sequential navigation hints: the previous page is the parent directory, the next one is ../scraperdata/. -->
<link rel="prev" href="..">
<link rel="next" href="../scraperdata/">
<link rel="icon" href="../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.23">
<title>Scraper - Projet Millesima S6</title>
<!-- Theme stylesheet first, then Roboto and Roboto Mono from Google Fonts, which the inline style rule assigns to the text and code font variables. -->
<link rel="stylesheet" href="../assets/stylesheets/main.84d31ad4.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<!-- Extra stylesheet for the mkdocstrings-rendered API blocks (judging by its file name). -->
<link rel="stylesheet" href="../assets/_mkdocstrings.css">
<!-- Defines four globals: __md_scope (URL of the parent directory), __md_hash (integer string hash), and __md_get / __md_set, which read and write JSON values in storage (localStorage by default) under keys prefixed with the scope pathname. Errors raised while writing are swallowed. -->
<script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<!-- Two checkboxes, #__drawer and #__search, targeted by the label elements with matching "for" attributes elsewhere on the page. Presumably the theme CSS uses their checked state to open the drawer and the search UI; confirm against the stylesheet. -->
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<!-- Overlay label: activating it toggles the #__drawer checkbox. -->
<label class="md-overlay" for="__drawer"></label>
<!-- Skip link pointing at the element with id "scraper" (the page's h1). -->
<div data-md-component="skip">
<a href="#scraper" class="md-skip">
Skip to content
</a>
</div>
<!-- Announcement container; empty on this page. -->
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<!-- Logo link to the parent directory; its accessible name comes from aria-label. -->
<a href=".." title="Projet Millesima S6" class="md-header__button md-logo" aria-label="Projet Millesima S6" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<!-- Icon-only label that toggles the #__drawer checkbox. -->
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<!-- Header title with two topics: the site name and the current page name. -->
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
Projet Millesima S6
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Scraper
</span>
</div>
</div>
</div>
<!-- Restores a stored colour palette: reads "__palette" through __md_get and, when it has a color entry, copies each key/value onto body as a data-md-color-<key> attribute. If the stored media is "(prefers-color-scheme)", the values are first taken from the input whose data-md-color-media matches the current light/dark preference. NOTE(review): no such palette inputs are visible in this part of the page; confirm they exist, otherwise that branch would dereference null. -->
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
<!-- Icon-only label that toggles the #__search checkbox. -->
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<!-- Search dialog. An element with role="dialog" must expose an accessible name, which aria-label provides here. -->
<div class="md-search" data-md-component="search" role="dialog" aria-label="Search">
<!-- Overlay label: activating it toggles the #__search checkbox. -->
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<!-- Query field; it has no visible label, so its name comes from aria-label. -->
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<!-- The icons below are purely decorative, so they are hidden from assistive technology. -->
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" aria-hidden="true"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="Search">
<!-- Reset button clears the form; it is named by aria-label, so its icon is hidden from assistive technology. -->
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" aria-hidden="true"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
</form>
<!-- Result area: a focusable scroll wrapper holding a status line and an initially empty result list. -->
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<!-- Primary sidebar: site-level navigation, with the current page's table of contents nested under the active item. -->
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<!-- Navigation title; as a label for #__drawer, activating it toggles the drawer checkbox. -->
<label class="md-nav__title" for="__drawer">
<a href=".." title="Projet Millesima S6" class="md-nav__button md-logo" aria-label="Projet Millesima S6" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
Projet Millesima S6
</label>
<ul class="md-nav__list" data-md-scrollfix>
<!-- Link to the page in the parent directory ("Millesima"). -->
<li class="md-nav__item">
<a href=".." class="md-nav__link">
<span class="md-ellipsis">
Millesima
</span>
</a>
</li>
<!-- Active item for this page. It carries both a label bound to the #__toc checkbox and a plain link to "./". -->
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Scraper
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Scraper
</span>
</a>
<!-- Nested table of contents: fragment links to the Scraper entry and five of its members. -->
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#scraper.Scraper" class="md-nav__link">
<span class="md-ellipsis">
Scraper
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.__init__" class="md-nav__link">
<span class="md-ellipsis">
__init__
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.getjsondata" class="md-nav__link">
<span class="md-ellipsis">
getjsondata
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.getresponse" class="md-nav__link">
<span class="md-ellipsis">
getresponse
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.getsoup" class="md-nav__link">
<span class="md-ellipsis">
getsoup
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.getvins" class="md-nav__link">
<span class="md-ellipsis">
getvins
</span>
</a>
</li>
</ul>
</nav>
</li>
<!-- Link to the next page, ../scraperdata/. -->
<li class="md-nav__item">
<a href="../scraperdata/" class="md-nav__link">
<span class="md-ellipsis">
_ScraperData
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<!-- Secondary sidebar: a standalone copy of the page's table of contents, with the same fragment links as the list nested in the primary navigation. -->
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<!-- Title label bound to the #__toc checkbox. -->
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<!-- One entry per documented object: the Scraper entry followed by five of its members. -->
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#scraper.Scraper" class="md-nav__link">
<span class="md-ellipsis">
Scraper
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.__init__" class="md-nav__link">
<span class="md-ellipsis">
__init__
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.getjsondata" class="md-nav__link">
<span class="md-ellipsis">
getjsondata
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.getresponse" class="md-nav__link">
<span class="md-ellipsis">
getresponse
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.getsoup" class="md-nav__link">
<span class="md-ellipsis">
getsoup
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#scraper.Scraper.getvins" class="md-nav__link">
<span class="md-ellipsis">
getvins
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="scraper">Scraper</h1>
<div class="doc doc-object doc-class">
<a id="scraper.Scraper"></a>
<div class="doc doc-contents first">
<p lang="fr">Client HTTP optimisé pour le scraping de millesima.fr.</p>
<p lang="fr">Gère la session persistante, les headers de navigation et un cache double
pour optimiser les performances et la discrétion.</p>
<details class="mkdocstrings-source">
<summary>Source code in <code>site-packages/scraper.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">222</span>
<span class="normal">223</span>
<span class="normal">224</span>
<span class="normal">225</span>
<span class="normal">226</span>
<span class="normal">227</span>
<span class="normal">228</span>
<span class="normal">229</span>
<span class="normal">230</span>
<span class="normal">231</span>
<span class="normal">232</span>
<span class="normal">233</span>
<span class="normal">234</span>
<span class="normal">235</span>
<span class="normal">236</span>
<span class="normal">237</span>
<span class="normal">238</span>
<span class="normal">239</span>
<span class="normal">240</span>
<span class="normal">241</span>
<span class="normal">242</span>
<span class="normal">243</span>
<span class="normal">244</span>
<span class="normal">245</span>
<span class="normal">246</span>
<span class="normal">247</span>
<span class="normal">248</span>
<span class="normal">249</span>
<span class="normal">250</span>
<span class="normal">251</span>
<span class="normal">252</span>
<span class="normal">253</span>
<span class="normal">254</span>
<span class="normal">255</span>
<span class="normal">256</span>
<span class="normal">257</span>
<span class="normal">258</span>
<span class="normal">259</span>
<span class="normal">260</span>
<span class="normal">261</span>
<span class="normal">262</span>
<span class="normal">263</span>
<span class="normal">264</span>
<span class="normal">265</span>
<span class="normal">266</span>
<span class="normal">267</span>
<span class="normal">268</span>
<span class="normal">269</span>
<span class="normal">270</span>
<span class="normal">271</span>
<span class="normal">272</span>
<span class="normal">273</span>
<span class="normal">274</span>
<span class="normal">275</span>
<span class="normal">276</span>
<span class="normal">277</span>
<span class="normal">278</span>
<span class="normal">279</span>
<span class="normal">280</span>
<span class="normal">281</span>
<span class="normal">282</span>
<span class="normal">283</span>
<span class="normal">284</span>
<span class="normal">285</span>
<span class="normal">286</span>
<span class="normal">287</span>
<span class="normal">288</span>
<span class="normal">289</span>
<span class="normal">290</span>
<span class="normal">291</span>
<span class="normal">292</span>
<span class="normal">293</span>
<span class="normal">294</span>
<span class="normal">295</span>
<span class="normal">296</span>
<span class="normal">297</span>
<span class="normal">298</span>
<span class="normal">299</span>
<span class="normal">300</span>
<span class="normal">301</span>
<span class="normal">302</span>
<span class="normal">303</span>
<span class="normal">304</span>
<span class="normal">305</span>
<span class="normal">306</span>
<span class="normal">307</span>
<span class="normal">308</span>
<span class="normal">309</span>
<span class="normal">310</span>
<span class="normal">311</span>
<span class="normal">312</span>
<span class="normal">313</span>
<span class="normal">314</span>
<span class="normal">315</span>
<span class="normal">316</span>
<span class="normal">317</span>
<span class="normal">318</span>
<span class="normal">319</span>
<span class="normal">320</span>
<span class="normal">321</span>
<span class="normal">322</span>
<span class="normal">323</span>
<span class="normal">324</span>
<span class="normal">325</span>
<span class="normal">326</span>
<span class="normal">327</span>
<span class="normal">328</span>
<span class="normal">329</span>
<span class="normal">330</span>
<span class="normal">331</span>
<span class="normal">332</span>
<span class="normal">333</span>
<span class="normal">334</span>
<span class="normal">335</span>
<span class="normal">336</span>
<span class="normal">337</span>
<span class="normal">338</span>
<span class="normal">339</span>
<span class="normal">340</span>
<span class="normal">341</span>
<span class="normal">342</span>
<span class="normal">343</span>
<span class="normal">344</span>
<span class="normal">345</span>
<span class="normal">346</span>
<span class="normal">347</span>
<span class="normal">348</span>
<span class="normal">349</span>
<span class="normal">350</span>
<span class="normal">351</span>
<span class="normal">352</span>
<span class="normal">353</span>
<span class="normal">354</span>
<span class="normal">355</span>
<span class="normal">356</span>
<span class="normal">357</span>
<span class="normal">358</span>
<span class="normal">359</span>
<span class="normal">360</span>
<span class="normal">361</span>
<span class="normal">362</span>
<span class="normal">363</span>
<span class="normal">364</span>
<span class="normal">365</span>
<span class="normal">366</span>
<span class="normal">367</span>
<span class="normal">368</span>
<span class="normal">369</span>
<span class="normal">370</span>
<span class="normal">371</span>
<span class="normal">372</span>
<span class="normal">373</span>
<span class="normal">374</span>
<span class="normal">375</span>
<span class="normal">376</span>
<span class="normal">377</span>
<span class="normal">378</span>
<span class="normal">379</span>
<span class="normal">380</span>
<span class="normal">381</span>
<span class="normal">382</span>
<span class="normal">383</span>
<span class="normal">384</span>
<span class="normal">385</span>
<span class="normal">386</span>
<span class="normal">387</span>
<span class="normal">388</span>
<span class="normal">389</span>
<span class="normal">390</span>
<span class="normal">391</span>
<span class="normal">392</span>
<span class="normal">393</span>
<span class="normal">394</span>
<span class="normal">395</span>
<span class="normal">396</span>
<span class="normal">397</span>
<span class="normal">398</span>
<span class="normal">399</span>
<span class="normal">400</span>
<span class="normal">401</span>
<span class="normal">402</span>
<span class="normal">403</span>
<span class="normal">404</span>
<span class="normal">405</span>
<span class="normal">406</span>
<span class="normal">407</span>
<span class="normal">408</span>
<span class="normal">409</span>
<span class="normal">410</span>
<span class="normal">411</span>
<span class="normal">412</span>
<span class="normal">413</span>
<span class="normal">414</span>
<span class="normal">415</span>
<span class="normal">416</span>
<span class="normal">417</span>
<span class="normal">418</span>
<span class="normal">419</span>
<span class="normal">420</span>
<span class="normal">421</span>
<span class="normal">422</span>
<span class="normal">423</span>
<span class="normal">424</span>
<span class="normal">425</span>
<span class="normal">426</span>
<span class="normal">427</span>
<span class="normal">428</span>
<span class="normal">429</span>
<span class="normal">430</span>
<span class="normal">431</span>
<span class="normal">432</span>
<span class="normal">433</span>
<span class="normal">434</span>
<span class="normal">435</span>
<span class="normal">436</span>
<span class="normal">437</span>
<span class="normal">438</span>
<span class="normal">439</span>
<span class="normal">440</span>
<span class="normal">441</span>
<span class="normal">442</span>
<span class="normal">443</span>
<span class="normal">444</span>
<span class="normal">445</span>
<span class="normal">446</span>
<span class="normal">447</span>
<span class="normal">448</span>
<span class="normal">449</span>
<span class="normal">450</span>
<span class="normal">451</span>
<span class="normal">452</span>
<span class="normal">453</span>
<span class="normal">454</span>
<span class="normal">455</span>
<span class="normal">456</span>
<span class="normal">457</span>
<span class="normal">458</span>
<span class="normal">459</span>
<span class="normal">460</span>
<span class="normal">461</span>
<span class="normal">462</span>
<span class="normal">463</span>
<span class="normal">464</span>
<span class="normal">465</span>
<span class="normal">466</span>
<span class="normal">467</span>
<span class="normal">468</span>
<span class="normal">469</span>
<span class="normal">470</span>
<span class="normal">471</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">class</span><span class="w"> </span><span class="nc">Scraper</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Client HTTP optimisé pour le scraping de millesima.fr.</span>
<span class="sd"> Gère la session persistante, les headers de navigation et un cache double</span>
<span class="sd"> pour optimiser les performances et la discrétion.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Initialise l&#39;infrastructure de navigation:</span>
<span class="sd"> - créer une session pour éviter de faire un handshake pour chaque requête</span>
<span class="sd"> - ajout d&#39;un header pour éviter le blocage de l&#39;accès au site</span>
<span class="sd"> - ajout d&#39;un système de cache</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_url</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;https://www.millesima.fr/&quot;</span>
<span class="c1"># Très utile pour éviter de renvoyer toujours les mêmes handshake</span>
<span class="c1"># TCP et d&#39;avoir toujours une connexion constante avec le server</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_session</span><span class="p">:</span> <span class="n">Session</span> <span class="o">=</span> <span class="n">Session</span><span class="p">()</span>
<span class="c1"># Crée une &quot;fausse carte d&#39;identité&quot; pour éviter que le site nous</span>
<span class="c1"># bloque car on serait des robots</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_session</span><span class="o">.</span><span class="n">headers</span><span class="o">.</span><span class="n">update</span><span class="p">(</span>
<span class="p">{</span>
<span class="s2">&quot;User-Agent&quot;</span><span class="p">:</span> <span class="s2">&quot;Mozilla/5.0 (Windows NT 10.0; Win64; x64) </span><span class="se">\</span>
<span class="s2"> AppleWebKit/537.36 (KHTML, like Gecko) </span><span class="se">\</span>
<span class="s2"> Chrome/122.0.0.0 Safari/537.36&quot;</span><span class="p">,</span>
<span class="s2">&quot;Accept-Language&quot;</span><span class="p">:</span> <span class="s2">&quot;fr-FR,fr;q=0.9,en;q=0.8&quot;</span><span class="p">,</span>
<span class="p">}</span>
<span class="p">)</span>
<span class="c1"># Système de cache pour éviter de solliciter le serveur inutilement</span>
<span class="c1"># utilise pour _request</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_latest_request</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Response</span><span class="p">)]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span>
<span class="c1"># utilise pour getsoup</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">:</span> <span class="n">OrderedDict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">BeautifulSoup</span><span class="p">]</span> <span class="o">=</span> <span class="n">OrderedDict</span><span class="p">[</span>
<span class="nb">str</span><span class="p">,</span> <span class="n">BeautifulSoup</span>
<span class="p">]()</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_request</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Response</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Effectue une requête GET sur le serveur Millesima.</span>
<span class="sd"> Args:</span>
<span class="sd"> subdir (str): Le sous-répertoire ou chemin de l&#39;URL (ex: &quot;/vins&quot;).</span>
<span class="sd"> Returns:</span>
<span class="sd"> Response: L&#39;objet réponse de la requête.</span>
<span class="sd"> Raises:</span>
<span class="sd"> HTTPError: Si le serveur renvoie un code d&#39;erreur (4xx, 5xx).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">target_url</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_url</span> <span class="o">+</span> <span class="n">subdir</span><span class="o">.</span><span class="n">lstrip</span><span class="p">(</span><span class="s2">&quot;/&quot;</span><span class="p">)</span>
<span class="c1"># envoyer une requête GET sur la page si erreur, renvoie un raise</span>
<span class="n">response</span><span class="p">:</span> <span class="n">Response</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_session</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">url</span><span class="o">=</span><span class="n">target_url</span><span class="p">,</span> <span class="n">timeout</span><span class="o">=</span><span class="mi">30</span><span class="p">)</span>
<span class="n">response</span><span class="o">.</span><span class="n">raise_for_status</span><span class="p">()</span>
<span class="k">return</span> <span class="n">response</span>
<span class="k">def</span><span class="w"> </span><span class="nf">getresponse</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="n">use_cache</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Response</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Récupère la réponse d&#39;une page, en utilisant le cache si possible.</span>
<span class="sd"> Args:</span>
<span class="sd"> subdir (str, optional): Le chemin de la page.</span>
<span class="sd"> use_cache (bool, optional): Utilise la donnée deja sauvegarder ou</span>
<span class="sd"> écrase la donnée utilisé avec la nouvelle</span>
<span class="sd"> Returns:</span>
<span class="sd"> Response: L&#39;objet réponse (cache ou nouvelle requête).</span>
<span class="sd"> Raises:</span>
<span class="sd"> HTTPError: Si le serveur renvoie un code d&#39;erreur (4xx, 5xx).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># si dans le cache, latest_request existe</span>
<span class="k">if</span> <span class="n">use_cache</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_request</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">rq_subdir</span><span class="p">,</span> <span class="n">rq_response</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_request</span>
<span class="c1"># si c&#39;est la meme requete et que use_cache est true,</span>
<span class="c1"># on renvoie celle enregistrer</span>
<span class="k">if</span> <span class="n">subdir</span> <span class="o">==</span> <span class="n">rq_subdir</span><span class="p">:</span>
<span class="k">return</span> <span class="n">rq_response</span>
<span class="n">request</span><span class="p">:</span> <span class="n">Response</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_request</span><span class="p">(</span><span class="n">subdir</span><span class="p">)</span>
<span class="c1"># on recrée la structure pour le systeme de cache si activer</span>
<span class="k">if</span> <span class="n">use_cache</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_latest_request</span> <span class="o">=</span> <span class="p">(</span><span class="n">subdir</span><span class="p">,</span> <span class="n">request</span><span class="p">)</span>
<span class="k">return</span> <span class="n">request</span>
<span class="k">def</span><span class="w"> </span><span class="nf">getsoup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">use_cache</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">BeautifulSoup</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Récupère le contenu HTML d&#39;une page et le transforme en objet BeautifulSoup.</span>
<span class="sd"> Args:</span>
<span class="sd"> subdir (str, optional): Le chemin de la page.</span>
<span class="sd"> Returns:</span>
<span class="sd"> BeautifulSoup: L&#39;objet parsé pour extraction de données.</span>
<span class="sd"> Raises:</span>
<span class="sd"> HTTPError: Si le serveur renvoie un code d&#39;erreur (4xx, 5xx).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">use_cache</span> <span class="ow">and</span> <span class="n">subdir</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">[</span><span class="n">subdir</span><span class="p">]</span>
<span class="n">markup</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getresponse</span><span class="p">(</span><span class="n">subdir</span><span class="p">)</span><span class="o">.</span><span class="n">text</span>
<span class="n">soup</span><span class="p">:</span> <span class="n">BeautifulSoup</span> <span class="o">=</span> <span class="n">BeautifulSoup</span><span class="p">(</span><span class="n">markup</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="s2">&quot;html.parser&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">use_cache</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">[</span><span class="n">subdir</span><span class="p">]</span> <span class="o">=</span> <span class="n">soup</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">10</span><span class="p">:</span>
<span class="n">_</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="o">.</span><span class="n">popitem</span><span class="p">(</span><span class="n">last</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="k">return</span> <span class="n">soup</span>
<span class="k">def</span><span class="w"> </span><span class="nf">getjsondata</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">id</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;__NEXT_DATA__&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">_ScraperData</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extrait les données JSON contenues dans la balise __NEXT_DATA__ du site.</span>
<span class="sd"> Args:</span>
<span class="sd"> subdir (str): Le chemin de la page.</span>
<span class="sd"> id (str, optional): L&#39;identifiant de la balise script.</span>
<span class="sd"> Raises:</span>
<span class="sd"> HTTPError: Erreur renvoyée par le serveur (4xx, 5xx).</span>
<span class="sd"> JSONDecodeError: Si le contenu de la balise n&#39;est pas un JSON valide.</span>
<span class="sd"> ValueError: Si les clés &#39;props&#39; ou &#39;pageProps&#39; sont absentes.</span>
<span class="sd"> Returns:</span>
<span class="sd"> _ScraperData: Instance contenant les données extraites.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">soup</span><span class="p">:</span> <span class="n">BeautifulSoup</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getsoup</span><span class="p">(</span><span class="n">subdir</span><span class="p">)</span>
<span class="n">script</span><span class="p">:</span> <span class="n">Tag</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="n">soup</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="s2">&quot;script&quot;</span><span class="p">,</span> <span class="nb">id</span><span class="o">=</span><span class="nb">id</span><span class="p">)</span>
<span class="k">if</span> <span class="n">script</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">script</span><span class="o">.</span><span class="n">string</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;le script id=</span><span class="si">{</span><span class="nb">id</span><span class="si">}</span><span class="s2"> est introuvable&quot;</span><span class="p">)</span>
<span class="n">current_data</span><span class="p">:</span> <span class="nb">object</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="nb">object</span><span class="p">,</span> <span class="n">loads</span><span class="p">(</span><span class="n">script</span><span class="o">.</span><span class="n">string</span><span class="p">))</span>
<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;props&quot;</span><span class="p">,</span> <span class="s2">&quot;pageProps&quot;</span><span class="p">]:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">current_data</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">current_data</span><span class="p">:</span>
<span class="n">current_data</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="nb">object</span><span class="p">,</span> <span class="n">current_data</span><span class="p">[</span><span class="n">key</span><span class="p">])</span>
<span class="k">continue</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Clé manquante dans le JSON : </span><span class="si">{</span><span class="n">key</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_ScraperData</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">object</span><span class="p">],</span> <span class="n">current_data</span><span class="p">))</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_geturlproductslist</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Récupère la liste des produits d&#39;une page de catégorie.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">data</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getjsondata</span><span class="p">(</span><span class="n">subdir</span><span class="p">)</span><span class="o">.</span><span class="n">getdata</span><span class="p">()</span>
<span class="k">for</span> <span class="n">element</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;initialReduxState&quot;</span><span class="p">,</span> <span class="s2">&quot;categ&quot;</span><span class="p">,</span> <span class="s2">&quot;content&quot;</span><span class="p">]:</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">object</span><span class="p">],</span> <span class="n">data</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">element</span><span class="p">))</span>
<span class="n">products</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span>
<span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]],</span> <span class="n">data</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;products&quot;</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">products</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="k">return</span> <span class="n">products</span>
<span class="k">except</span> <span class="p">(</span><span class="n">JSONDecodeError</span><span class="p">,</span> <span class="n">HTTPError</span><span class="p">):</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_writevins</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cache</span><span class="p">:</span> <span class="nb">set</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">product</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">f</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;_summary_</span>
<span class="sd"> Args:</span>
<span class="sd"> cache (set[str]): _description_</span>
<span class="sd"> product (dict): _description_</span>
<span class="sd"> f (Any): _description_</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">product</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="n">link</span><span class="p">:</span> <span class="n">Any</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="n">product</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;seoKeyword&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">link</span> <span class="ow">and</span> <span class="n">link</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">cache</span><span class="p">:</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">infos</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getjsondata</span><span class="p">(</span><span class="n">link</span><span class="p">)</span><span class="o">.</span><span class="n">informations</span><span class="p">()</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">infos</span> <span class="o">+</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">cache</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">link</span><span class="p">)</span>
<span class="k">except</span> <span class="p">(</span><span class="n">JSONDecodeError</span><span class="p">,</span> <span class="n">HTTPError</span><span class="p">)</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Erreur sur le produit </span><span class="si">{</span><span class="n">link</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">getvins</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">filename</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">reset</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Scrape toutes les pages d&#39;une catégorie et sauvegarde en CSV.</span>
<span class="sd"> Args:</span>
<span class="sd"> subdir (str): La catégorie (ex: &#39;/vins-rouges&#39;).</span>
<span class="sd"> filename (str): Nom du fichier de sortie (ex: &#39;vins.csv&#39;).</span>
<span class="sd"> reset (bool): (Optionnel) pour réinitialiser le processus.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># mode d&#39;écriture fichier</span>
<span class="n">mode</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s2">&quot;w&quot;</span><span class="p">,</span> <span class="s2">&quot;a+&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;w&quot;</span> <span class="k">if</span> <span class="n">reset</span> <span class="k">else</span> <span class="s2">&quot;a+&quot;</span>
<span class="c1"># titre</span>
<span class="n">title</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;Appellation,Robert,Robinson,Suckling,Prix</span><span class="se">\n</span><span class="s2">&quot;</span>
<span class="c1"># page du début</span>
<span class="n">page</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span>
<span class="c1"># le set qui sert de cache</span>
<span class="n">cache</span><span class="p">:</span> <span class="nb">set</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="nb">set</span><span class="p">[</span><span class="nb">str</span><span class="p">]()</span>
<span class="n">custom_format</span> <span class="o">=</span> <span class="s2">&quot;</span><span class="si">{l_bar}</span><span class="s2"> </span><span class="si">{bar:20}</span><span class="s2"> </span><span class="si">{r_bar}</span><span class="s2">&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">reset</span><span class="p">:</span>
<span class="c1"># appelle la fonction pour load le cache, si il existe</span>
<span class="c1"># pas, il utilise les variables de base sinon il override</span>
<span class="c1"># toute les variables pour continuer et pas recommencer le</span>
<span class="c1"># processus en entier.</span>
<span class="n">serializable</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">set</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="n">loadstate</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">serializable</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="c1"># override la page et le cache</span>
<span class="n">page</span><span class="p">,</span> <span class="n">cache</span> <span class="o">=</span> <span class="n">serializable</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="n">mode</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
<span class="c1"># check si le titre est bien présent au début du buffer</span>
<span class="c1"># sinon il l&#39;ecrit, petit bug potentiel, a+ ecrit tout le</span>
<span class="c1"># temps a la fin du buffer, si on a ecrit des choses avant</span>
<span class="c1"># le titre sera apres ces données mais on part du principe</span>
<span class="c1"># que personne va toucher le fichier.</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">SEEK_SET</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">title</span><span class="p">))</span> <span class="o">==</span> <span class="n">title</span><span class="p">):</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">title</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">SEEK_END</span><span class="p">)</span>
<span class="k">while</span> <span class="kc">True</span><span class="p">:</span>
<span class="n">products_list</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_geturlproductslist</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">subdir</span><span class="si">}</span><span class="s2">?page=</span><span class="si">{</span><span class="n">page</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">products_list</span><span class="p">:</span>
<span class="k">break</span>
<span class="n">pbar</span><span class="p">:</span> <span class="n">tqdm</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="n">tqdm</span><span class="p">(</span>
<span class="n">products_list</span><span class="p">,</span> <span class="n">bar_format</span><span class="o">=</span><span class="n">custom_format</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">product</span> <span class="ow">in</span> <span class="n">pbar</span><span class="p">:</span>
<span class="n">keyword</span> <span class="o">=</span> <span class="n">product</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;seoKeyword&quot;</span><span class="p">,</span> <span class="s2">&quot;Inconnu&quot;</span><span class="p">)[:</span><span class="mi">40</span><span class="p">]</span>
<span class="n">pbar</span><span class="o">.</span><span class="n">set_description</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Page: </span><span class="si">{</span><span class="n">page</span><span class="si">:</span><span class="s2">&lt;3</span><span class="si">}</span><span class="s2"> | Product: </span><span class="si">{</span><span class="n">keyword</span><span class="si">:</span><span class="s2">&lt;40</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_writevins</span><span class="p">(</span><span class="n">cache</span><span class="p">,</span> <span class="n">product</span><span class="p">,</span> <span class="n">f</span><span class="p">)</span>
<span class="n">page</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">except</span> <span class="p">(</span><span class="ne">Exception</span><span class="p">,</span> <span class="n">HTTPError</span><span class="p">,</span> <span class="ne">KeyboardInterrupt</span><span class="p">,</span> <span class="n">JSONDecodeError</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">reset</span><span class="p">:</span>
<span class="n">savestate</span><span class="p">((</span><span class="n">page</span><span class="p">,</span> <span class="n">cache</span><span class="p">))</span>
</code></pre></div></td></tr></table></div>
</details>
<div class="doc doc-children">
<div class="doc doc-object doc-function">
<h2 id="scraper.Scraper.__init__" class="doc doc-heading">
<code class="highlight language-python"><span class="fm">__init__</span><span class="p">()</span></code>
</h2>
<div class="doc doc-contents ">
<p>Initialise l'infrastructure de navigation:</p>
<ul>
<li>création d'une session pour éviter de refaire un handshake à chaque requête</li>
<li>ajout d'un header pour éviter le blocage de l'accès au site</li>
<li>ajout d'un système de cache</li>
</ul>
<details class="mkdocstrings-source">
<summary>Source code in <code>site-packages/scraper.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">230</span>
<span class="normal">231</span>
<span class="normal">232</span>
<span class="normal">233</span>
<span class="normal">234</span>
<span class="normal">235</span>
<span class="normal">236</span>
<span class="normal">237</span>
<span class="normal">238</span>
<span class="normal">239</span>
<span class="normal">240</span>
<span class="normal">241</span>
<span class="normal">242</span>
<span class="normal">243</span>
<span class="normal">244</span>
<span class="normal">245</span>
<span class="normal">246</span>
<span class="normal">247</span>
<span class="normal">248</span>
<span class="normal">249</span>
<span class="normal">250</span>
<span class="normal">251</span>
<span class="normal">252</span>
<span class="normal">253</span>
<span class="normal">254</span>
<span class="normal">255</span>
<span class="normal">256</span>
<span class="normal">257</span>
<span class="normal">258</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Initialise l&#39;infrastructure de navigation:</span>
<span class="sd"> - créer une session pour éviter de faire un handshake pour chaque requête</span>
<span class="sd"> - ajout d&#39;un header pour éviter le blocage de l&#39;accès au site</span>
<span class="sd"> - ajout d&#39;un système de cache</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_url</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;https://www.millesima.fr/&quot;</span>
<span class="c1"># Très utile pour éviter de renvoyer toujours les mêmes handshake</span>
<span class="c1"># TCP et d&#39;avoir toujours une connexion constante avec le server</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_session</span><span class="p">:</span> <span class="n">Session</span> <span class="o">=</span> <span class="n">Session</span><span class="p">()</span>
<span class="c1"># Crée une &quot;fausse carte d&#39;identité&quot; pour éviter que le site nous</span>
<span class="c1"># bloque car on serait des robots</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_session</span><span class="o">.</span><span class="n">headers</span><span class="o">.</span><span class="n">update</span><span class="p">(</span>
<span class="p">{</span>
<span class="s2">&quot;User-Agent&quot;</span><span class="p">:</span> <span class="s2">&quot;Mozilla/5.0 (Windows NT 10.0; Win64; x64) </span><span class="se">\</span>
<span class="s2"> AppleWebKit/537.36 (KHTML, like Gecko) </span><span class="se">\</span>
<span class="s2"> Chrome/122.0.0.0 Safari/537.36&quot;</span><span class="p">,</span>
<span class="s2">&quot;Accept-Language&quot;</span><span class="p">:</span> <span class="s2">&quot;fr-FR,fr;q=0.9,en;q=0.8&quot;</span><span class="p">,</span>
<span class="p">}</span>
<span class="p">)</span>
<span class="c1"># Système de cache pour éviter de solliciter le serveur inutilement</span>
<span class="c1"># utilise pour _request</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_latest_request</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Response</span><span class="p">)]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span>
<span class="c1"># utilise pour getsoup</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">:</span> <span class="n">OrderedDict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">BeautifulSoup</span><span class="p">]</span> <span class="o">=</span> <span class="n">OrderedDict</span><span class="p">[</span>
<span class="nb">str</span><span class="p">,</span> <span class="n">BeautifulSoup</span>
<span class="p">]()</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="scraper.Scraper.getjsondata" class="doc doc-heading">
<code class="highlight language-python"><span class="n">getjsondata</span><span class="p">(</span><span class="n">subdir</span><span class="p">,</span> <span class="nb">id</span><span class="o">=</span><span class="s1">&#39;__NEXT_DATA__&#39;</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Extrait les données JSON contenues dans la balise <code>__NEXT_DATA__</code> du site.</p>
<p><span class="doc-section-title">Parameters:</span></p>
<table>
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Description</th>
<th>Default</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td>
<code>subdir</code>
</td>
<td>
<code><span title="str">str</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Le chemin de la page.</p>
</div>
</td>
<td>
<em>required</em>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code>id</code>
</td>
<td>
<code><span title="str">str</span></code>
</td>
<td>
<div class="doc-md-description">
<p>L'identifiant de la balise script.</p>
</div>
</td>
<td>
<code>&#39;__NEXT_DATA__&#39;</code>
</td>
</tr>
</tbody>
</table>
<p><span class="doc-section-title">Raises:</span></p>
<table>
<thead>
<tr>
<th>Type</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td>
<code><span title="requests.HTTPError">HTTPError</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Erreur renvoyée par le serveur (4xx, 5xx).</p>
</div>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code><span title="json.JSONDecodeError">JSONDecodeError</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Si le contenu de la balise n'est pas un JSON valide.</p>
</div>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code><span title="ValueError">ValueError</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Si la balise script est introuvable ou vide, ou si les clés 'props' ou 'pageProps' sont absentes.</p>
</div>
</td>
</tr>
</tbody>
</table>
<p><span class="doc-section-title">Returns:</span></p>
<table>
<thead>
<tr>
<th>Name</th> <th>Type</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td><code>_ScraperData</code></td> <td>
<code><a class="autorefs autorefs-internal" title="scraper._ScraperData" href="../scraperdata/#scraper._ScraperData">_ScraperData</a></code>
</td>
<td>
<div class="doc-md-description">
<p>Instance contenant les données extraites.</p>
</div>
</td>
</tr>
</tbody>
</table>
<details class="mkdocstrings-source">
<summary>Source code in <code>site-packages/scraper.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">339</span>
<span class="normal">340</span>
<span class="normal">341</span>
<span class="normal">342</span>
<span class="normal">343</span>
<span class="normal">344</span>
<span class="normal">345</span>
<span class="normal">346</span>
<span class="normal">347</span>
<span class="normal">348</span>
<span class="normal">349</span>
<span class="normal">350</span>
<span class="normal">351</span>
<span class="normal">352</span>
<span class="normal">353</span>
<span class="normal">354</span>
<span class="normal">355</span>
<span class="normal">356</span>
<span class="normal">357</span>
<span class="normal">358</span>
<span class="normal">359</span>
<span class="normal">360</span>
<span class="normal">361</span>
<span class="normal">362</span>
<span class="normal">363</span>
<span class="normal">364</span>
<span class="normal">365</span>
<span class="normal">366</span>
<span class="normal">367</span>
<span class="normal">368</span>
<span class="normal">369</span>
<span class="normal">370</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span><span class="w"> </span><span class="nf">getjsondata</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">id</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;__NEXT_DATA__&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">_ScraperData</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extrait les données JSON contenues dans la balise __NEXT_DATA__ du site.</span>
<span class="sd"> Args:</span>
<span class="sd"> subdir (str): Le chemin de la page.</span>
<span class="sd"> id (str, optional): L&#39;identifiant de la balise script.</span>
<span class="sd"> Raises:</span>
<span class="sd"> HTTPError: Erreur renvoyée par le serveur (4xx, 5xx).</span>
<span class="sd"> JSONDecodeError: Si le contenu de la balise n&#39;est pas un JSON valide.</span>
<span class="sd"> ValueError: Si les clés &#39;props&#39; ou &#39;pageProps&#39; sont absentes.</span>
<span class="sd"> Returns:</span>
<span class="sd"> _ScraperData: Instance contenant les données extraites.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">soup</span><span class="p">:</span> <span class="n">BeautifulSoup</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getsoup</span><span class="p">(</span><span class="n">subdir</span><span class="p">)</span>
<span class="n">script</span><span class="p">:</span> <span class="n">Tag</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="n">soup</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="s2">&quot;script&quot;</span><span class="p">,</span> <span class="nb">id</span><span class="o">=</span><span class="nb">id</span><span class="p">)</span>
<span class="k">if</span> <span class="n">script</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">script</span><span class="o">.</span><span class="n">string</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;le script id=</span><span class="si">{</span><span class="nb">id</span><span class="si">}</span><span class="s2"> est introuvable&quot;</span><span class="p">)</span>
<span class="n">current_data</span><span class="p">:</span> <span class="nb">object</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="nb">object</span><span class="p">,</span> <span class="n">loads</span><span class="p">(</span><span class="n">script</span><span class="o">.</span><span class="n">string</span><span class="p">))</span>
<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;props&quot;</span><span class="p">,</span> <span class="s2">&quot;pageProps&quot;</span><span class="p">]:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">current_data</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">current_data</span><span class="p">:</span>
<span class="n">current_data</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="nb">object</span><span class="p">,</span> <span class="n">current_data</span><span class="p">[</span><span class="n">key</span><span class="p">])</span>
<span class="k">continue</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Clé manquante dans le JSON : </span><span class="si">{</span><span class="n">key</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_ScraperData</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">object</span><span class="p">],</span> <span class="n">current_data</span><span class="p">))</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="scraper.Scraper.getresponse" class="doc doc-heading">
<code class="highlight language-python"><span class="n">getresponse</span><span class="p">(</span><span class="n">subdir</span><span class="o">=</span><span class="s1">&#39;&#39;</span><span class="p">,</span> <span class="n">use_cache</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Récupère la réponse d'une page, en utilisant le cache si possible.</p>
<p><span class="doc-section-title">Parameters:</span></p>
<table>
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Description</th>
<th>Default</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td>
<code>subdir</code>
</td>
<td>
<code><span title="str">str</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Le chemin de la page.</p>
</div>
</td>
<td>
<code>&#39;&#39;</code>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code>use_cache</code>
</td>
<td>
<code><span title="bool">bool</span></code>
</td>
<td>
<div class="doc-md-description">
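<p>Si vrai, renvoie la réponse déjà sauvegardée lorsqu'elle correspond au même chemin ; sinon effectue la requête et sauvegarde la nouvelle réponse à la place de l'ancienne.
Si faux, effectue toujours une nouvelle requête sans modifier la réponse sauvegardée.</p>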
</div>
</td>
<td>
<code>True</code>
</td>
</tr>
</tbody>
</table>
<p><span class="doc-section-title">Returns:</span></p>
<table>
<thead>
<tr>
<th>Name</th> <th>Type</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td><code>Response</code></td> <td>
<code><span title="requests.Response">Response</span></code>
</td>
<td>
<div class="doc-md-description">
<p>L'objet réponse (cache ou nouvelle requête).</p>
</div>
</td>
</tr>
</tbody>
</table>
<p><span class="doc-section-title">Raises:</span></p>
<table>
<thead>
<tr>
<th>Type</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td>
<code><span title="requests.HTTPError">HTTPError</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Si le serveur renvoie un code d'erreur (4xx, 5xx).</p>
</div>
</td>
</tr>
</tbody>
</table>
<details class="mkdocstrings-source">
<summary>Source code in <code>site-packages/scraper.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">279</span>
<span class="normal">280</span>
<span class="normal">281</span>
<span class="normal">282</span>
<span class="normal">283</span>
<span class="normal">284</span>
<span class="normal">285</span>
<span class="normal">286</span>
<span class="normal">287</span>
<span class="normal">288</span>
<span class="normal">289</span>
<span class="normal">290</span>
<span class="normal">291</span>
<span class="normal">292</span>
<span class="normal">293</span>
<span class="normal">294</span>
<span class="normal">295</span>
<span class="normal">296</span>
<span class="normal">297</span>
<span class="normal">298</span>
<span class="normal">299</span>
<span class="normal">300</span>
<span class="normal">301</span>
<span class="normal">302</span>
<span class="normal">303</span>
<span class="normal">304</span>
<span class="normal">305</span>
<span class="normal">306</span>
<span class="normal">307</span>
<span class="normal">308</span>
<span class="normal">309</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span><span class="w"> </span><span class="nf">getresponse</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="n">use_cache</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Response</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Récupère la réponse d&#39;une page, en utilisant le cache si possible.</span>
<span class="sd"> Args:</span>
<span class="sd"> subdir (str, optional): Le chemin de la page.</span>
<span class="sd"> use_cache (bool, optional): Utilise la donnée deja sauvegarder ou</span>
<span class="sd"> écrase la donnée utilisé avec la nouvelle</span>
<span class="sd"> Returns:</span>
<span class="sd"> Response: L&#39;objet réponse (cache ou nouvelle requête).</span>
<span class="sd"> Raises:</span>
<span class="sd"> HTTPError: Si le serveur renvoie un code d&#39;erreur (4xx, 5xx).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># si dans le cache, latest_request existe</span>
<span class="k">if</span> <span class="n">use_cache</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_request</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">rq_subdir</span><span class="p">,</span> <span class="n">rq_response</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_request</span>
<span class="c1"># si c&#39;est la meme requete et que use_cache est true,</span>
<span class="c1"># on renvoie celle enregistrer</span>
<span class="k">if</span> <span class="n">subdir</span> <span class="o">==</span> <span class="n">rq_subdir</span><span class="p">:</span>
<span class="k">return</span> <span class="n">rq_response</span>
<span class="n">request</span><span class="p">:</span> <span class="n">Response</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_request</span><span class="p">(</span><span class="n">subdir</span><span class="p">)</span>
<span class="c1"># on recrée la structure pour le systeme de cache si activer</span>
<span class="k">if</span> <span class="n">use_cache</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_latest_request</span> <span class="o">=</span> <span class="p">(</span><span class="n">subdir</span><span class="p">,</span> <span class="n">request</span><span class="p">)</span>
<span class="k">return</span> <span class="n">request</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="scraper.Scraper.getsoup" class="doc doc-heading">
<code class="highlight language-python"><span class="n">getsoup</span><span class="p">(</span><span class="n">subdir</span><span class="p">,</span> <span class="n">use_cache</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Récupère le contenu HTML d'une page et le transforme en objet BeautifulSoup.</p>
<p><span class="doc-section-title">Parameters:</span></p>
<table>
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Description</th>
<th>Default</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td>
<code>subdir</code>
</td>
<td>
<code><span title="str">str</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Le chemin de la page.</p>
</div>
</td>
<td>
<em>required</em>
</td>
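</tr>
<tr class="doc-section-item">
<td>
<code>use_cache</code>
</td>
<td>
<code><span title="bool">bool</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Si vrai, renvoie l'objet BeautifulSoup déjà mis en cache pour ce chemin s'il existe ; sinon l'objet nouvellement parsé est ajouté au cache.</p>
</div>
</td>
<td>
<code>True</code>
</td>
</tr>
</tbody>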
</table>
<p><span class="doc-section-title">Returns:</span></p>
<table>
<thead>
<tr>
<th>Name</th> <th>Type</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td><code>BeautifulSoup</code></td> <td>
<code><span title="bs4.BeautifulSoup">BeautifulSoup</span></code>
</td>
<td>
<div class="doc-md-description">
<p>L'objet parsé pour extraction de données.</p>
</div>
</td>
</tr>
</tbody>
</table>
<p><span class="doc-section-title">Raises:</span></p>
<table>
<thead>
<tr>
<th>Type</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td>
<code><span title="requests.HTTPError">HTTPError</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Si le serveur renvoie un code d'erreur (4xx, 5xx).</p>
</div>
</td>
</tr>
</tbody>
</table>
<details class="mkdocstrings-source">
<summary>Source code in <code>site-packages/scraper.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">311</span>
<span class="normal">312</span>
<span class="normal">313</span>
<span class="normal">314</span>
<span class="normal">315</span>
<span class="normal">316</span>
<span class="normal">317</span>
<span class="normal">318</span>
<span class="normal">319</span>
<span class="normal">320</span>
<span class="normal">321</span>
<span class="normal">322</span>
<span class="normal">323</span>
<span class="normal">324</span>
<span class="normal">325</span>
<span class="normal">326</span>
<span class="normal">327</span>
<span class="normal">328</span>
<span class="normal">329</span>
<span class="normal">330</span>
<span class="normal">331</span>
<span class="normal">332</span>
<span class="normal">333</span>
<span class="normal">334</span>
<span class="normal">335</span>
<span class="normal">336</span>
<span class="normal">337</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span><span class="w"> </span><span class="nf">getsoup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">use_cache</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">BeautifulSoup</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Récupère le contenu HTML d&#39;une page et le transforme en objet BeautifulSoup.</span>
<span class="sd"> Args:</span>
<span class="sd"> subdir (str, optional): Le chemin de la page.</span>
<span class="sd"> Returns:</span>
<span class="sd"> BeautifulSoup: L&#39;objet parsé pour extraction de données.</span>
<span class="sd"> Raises:</span>
<span class="sd"> HTTPError: Si le serveur renvoie un code d&#39;erreur (4xx, 5xx).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">use_cache</span> <span class="ow">and</span> <span class="n">subdir</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">[</span><span class="n">subdir</span><span class="p">]</span>
<span class="n">markup</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">getresponse</span><span class="p">(</span><span class="n">subdir</span><span class="p">)</span><span class="o">.</span><span class="n">text</span>
<span class="n">soup</span><span class="p">:</span> <span class="n">BeautifulSoup</span> <span class="o">=</span> <span class="n">BeautifulSoup</span><span class="p">(</span><span class="n">markup</span><span class="p">,</span> <span class="n">features</span><span class="o">=</span><span class="s2">&quot;html.parser&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">use_cache</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">[</span><span class="n">subdir</span><span class="p">]</span> <span class="o">=</span> <span class="n">soup</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">10</span><span class="p">:</span>
<span class="n">_</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_latest_soups</span><span class="o">.</span><span class="n">popitem</span><span class="p">(</span><span class="n">last</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="k">return</span> <span class="n">soup</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
<div class="doc doc-object doc-function">
<h2 id="scraper.Scraper.getvins" class="doc doc-heading">
<code class="highlight language-python"><span class="n">getvins</span><span class="p">(</span><span class="n">subdir</span><span class="p">,</span> <span class="n">filename</span><span class="p">,</span> <span class="n">reset</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></code>
</h2>
<div class="doc doc-contents ">
<p>Scrape toutes les pages d'une catégorie et sauvegarde en CSV.</p>
<p><span class="doc-section-title">Parameters:</span></p>
<table>
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Description</th>
<th>Default</th>
</tr>
</thead>
<tbody>
<tr class="doc-section-item">
<td>
<code>subdir</code>
</td>
<td>
<code><span title="str">str</span></code>
</td>
<td>
<div class="doc-md-description">
<p>La catégorie (ex: '/vins-rouges').</p>
</div>
</td>
<td>
<em>required</em>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code>filename</code>
</td>
<td>
<code><span title="str">str</span></code>
</td>
<td>
<div class="doc-md-description">
<p>Nom du fichier de sortie (ex: 'vins.csv').</p>
</div>
</td>
<td>
<em>required</em>
</td>
</tr>
<tr class="doc-section-item">
<td>
<code>reset</code>
</td>
<td>
<code><span title="bool">bool</span></code>
</td>
<td>
<div class="doc-md-description">
<p>(Optionnel) pour réinitialiser le processus.</p>
</div>
</td>
<td>
<code>False</code>
</td>
</tr>
</tbody>
</table>
<details class="mkdocstrings-source">
<summary>Source code in <code>site-packages/scraper.py</code></summary>
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">410</span>
<span class="normal">411</span>
<span class="normal">412</span>
<span class="normal">413</span>
<span class="normal">414</span>
<span class="normal">415</span>
<span class="normal">416</span>
<span class="normal">417</span>
<span class="normal">418</span>
<span class="normal">419</span>
<span class="normal">420</span>
<span class="normal">421</span>
<span class="normal">422</span>
<span class="normal">423</span>
<span class="normal">424</span>
<span class="normal">425</span>
<span class="normal">426</span>
<span class="normal">427</span>
<span class="normal">428</span>
<span class="normal">429</span>
<span class="normal">430</span>
<span class="normal">431</span>
<span class="normal">432</span>
<span class="normal">433</span>
<span class="normal">434</span>
<span class="normal">435</span>
<span class="normal">436</span>
<span class="normal">437</span>
<span class="normal">438</span>
<span class="normal">439</span>
<span class="normal">440</span>
<span class="normal">441</span>
<span class="normal">442</span>
<span class="normal">443</span>
<span class="normal">444</span>
<span class="normal">445</span>
<span class="normal">446</span>
<span class="normal">447</span>
<span class="normal">448</span>
<span class="normal">449</span>
<span class="normal">450</span>
<span class="normal">451</span>
<span class="normal">452</span>
<span class="normal">453</span>
<span class="normal">454</span>
<span class="normal">455</span>
<span class="normal">456</span>
<span class="normal">457</span>
<span class="normal">458</span>
<span class="normal">459</span>
<span class="normal">460</span>
<span class="normal">461</span>
<span class="normal">462</span>
<span class="normal">463</span>
<span class="normal">464</span>
<span class="normal">465</span>
<span class="normal">466</span>
<span class="normal">467</span>
<span class="normal">468</span>
<span class="normal">469</span>
<span class="normal">470</span>
<span class="normal">471</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="k">def</span><span class="w"> </span><span class="nf">getvins</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subdir</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">filename</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">reset</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Scrape toutes les pages d&#39;une catégorie et sauvegarde en CSV.</span>
<span class="sd"> Args:</span>
<span class="sd"> subdir (str): La catégorie (ex: &#39;/vins-rouges&#39;).</span>
<span class="sd"> filename (str): Nom du fichier de sortie (ex: &#39;vins.csv&#39;).</span>
<span class="sd"> reset (bool): (Optionnel) pour réinitialiser le processus.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># mode d&#39;écriture fichier</span>
<span class="n">mode</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s2">&quot;w&quot;</span><span class="p">,</span> <span class="s2">&quot;a+&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;w&quot;</span> <span class="k">if</span> <span class="n">reset</span> <span class="k">else</span> <span class="s2">&quot;a+&quot;</span>
<span class="c1"># titre</span>
<span class="n">title</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;Appellation,Robert,Robinson,Suckling,Prix</span><span class="se">\n</span><span class="s2">&quot;</span>
<span class="c1"># page du début</span>
<span class="n">page</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span>
<span class="c1"># le set qui sert de cache</span>
<span class="n">cache</span><span class="p">:</span> <span class="nb">set</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="nb">set</span><span class="p">[</span><span class="nb">str</span><span class="p">]()</span>
<span class="n">custom_format</span> <span class="o">=</span> <span class="s2">&quot;</span><span class="si">{l_bar}</span><span class="s2"> </span><span class="si">{bar:20}</span><span class="s2"> </span><span class="si">{r_bar}</span><span class="s2">&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">reset</span><span class="p">:</span>
<span class="c1"># appelle la fonction pour load le cache, si il existe</span>
<span class="c1"># pas, il utilise les variables de base sinon il override</span>
<span class="c1"># toute les variables pour continuer et pas recommencer le</span>
<span class="c1"># processus en entier.</span>
<span class="n">serializable</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">set</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="n">loadstate</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">serializable</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="c1"># override la page et le cache</span>
<span class="n">page</span><span class="p">,</span> <span class="n">cache</span> <span class="o">=</span> <span class="n">serializable</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="n">mode</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
<span class="c1"># check si le titre est bien présent au début du buffer</span>
<span class="c1"># sinon il l&#39;ecrit, petit bug potentiel, a+ ecrit tout le</span>
<span class="c1"># temps a la fin du buffer, si on a ecrit des choses avant</span>
<span class="c1"># le titre sera apres ces données mais on part du principe</span>
<span class="c1"># que personne va toucher le fichier.</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">SEEK_SET</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">title</span><span class="p">))</span> <span class="o">==</span> <span class="n">title</span><span class="p">):</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">title</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">SEEK_END</span><span class="p">)</span>
<span class="k">while</span> <span class="kc">True</span><span class="p">:</span>
<span class="n">products_list</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">|</span> <span class="kc">None</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_geturlproductslist</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">subdir</span><span class="si">}</span><span class="s2">?page=</span><span class="si">{</span><span class="n">page</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">products_list</span><span class="p">:</span>
<span class="k">break</span>
<span class="n">pbar</span><span class="p">:</span> <span class="n">tqdm</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="n">tqdm</span><span class="p">(</span>
<span class="n">products_list</span><span class="p">,</span> <span class="n">bar_format</span><span class="o">=</span><span class="n">custom_format</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">product</span> <span class="ow">in</span> <span class="n">pbar</span><span class="p">:</span>
<span class="n">keyword</span> <span class="o">=</span> <span class="n">product</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;seoKeyword&quot;</span><span class="p">,</span> <span class="s2">&quot;Inconnu&quot;</span><span class="p">)[:</span><span class="mi">40</span><span class="p">]</span>
<span class="n">pbar</span><span class="o">.</span><span class="n">set_description</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Page: </span><span class="si">{</span><span class="n">page</span><span class="si">:</span><span class="s2">&lt;3</span><span class="si">}</span><span class="s2"> | Product: </span><span class="si">{</span><span class="n">keyword</span><span class="si">:</span><span class="s2">&lt;40</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_writevins</span><span class="p">(</span><span class="n">cache</span><span class="p">,</span> <span class="n">product</span><span class="p">,</span> <span class="n">f</span><span class="p">)</span>
<span class="n">page</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">except</span> <span class="p">(</span><span class="ne">Exception</span><span class="p">,</span> <span class="n">HTTPError</span><span class="p">,</span> <span class="ne">KeyboardInterrupt</span><span class="p">,</span> <span class="n">JSONDecodeError</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">reset</span><span class="p">:</span>
<span class="n">savestate</span><span class="p">((</span><span class="n">page</span><span class="p">,</span> <span class="n">cache</span><span class="p">))</span>
</code></pre></div></td></tr></table></div>
</details>
</div>
</div>
</div>
</div>
</div>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.973d3a69.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../assets/javascripts/bundle.f55a23d4.min.js"></script>
</body>
</html>