From 4faa6ef364fb068cf160c9b04626c9614e9fed7f Mon Sep 17 00:00:00 2001 From: Tykayn Date: Wed, 12 Jan 2022 00:29:01 +0100 Subject: [PATCH] gather links --- scrapers/ccpl.ts | 47 +++++++++++++++++++++-------- sources_examples/ccpl_scrapped.json | 42 ++++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/scrapers/ccpl.ts b/scrapers/ccpl.ts index a1d2805..f6ea8f5 100644 --- a/scrapers/ccpl.ts +++ b/scrapers/ccpl.ts @@ -29,7 +29,11 @@ let scrappedData: any = { }; options = {headless: false, devtools: true} - +/** + * fetch CCPL agenda + * find all links to events details + * scrap data on each event page + */ async function run() { const browser = await puppeteer.launch(options); @@ -40,26 +44,35 @@ async function run() { // titre: #widgit_event_details .widgit_title - let scrappedData = await page.evaluate(() => { + let dataRun = await page.evaluate(() => { let sel = '#widgit_results_agenda .widgit_result .title'; - let listOfElements: any = document.querySelectorAll(sel); - let linkTitleEvent: any = []; + let listOfElements: any = document.querySelectorAll('#widgit_results_agenda .widgit_result .title'); + let listOfElementsLinks: any = document.querySelectorAll('#widgit_results_agenda a'); + + let hrefsDetails: any = []; let titleList: any = []; + if (listOfElementsLinks.length) { + listOfElementsLinks.forEach((elem: any) => { + hrefsDetails.push(elem.getAttribute('data-w-href')); + }) + } if (listOfElements.length) { listOfElements.forEach((elem: any) => { console.log('title', elem.innerHTML); titleList.push(elem.innerHTML); - linkTitleEvent.push(elem.attribute['href']); }) } + + const scrapped: any = { selector: sel, titleList, - linkTitleEvent + hrefsDetails, + listOfElementsLinks }; console.log('titleList', titleList); @@ -67,13 +80,21 @@ async function run() { }); - // scrappedData.linkTitleEvent.forEach((url: string) => { - // // add delay - // getEventPageInfo(url) - // }) + let ii=0; + dataRun.listOfElementsLinks.forEach((url: string) => { + // add delay + console.log('url', url); + // if(ii<3){ + // getEventPageInfo(url) + // + // } + ii++; + }) - browser.close(); + // browser.close(); console.log('DONE'); + saveScrappeddata({scrappedData, ...dataRun}); + return dataRun; } async function getEventPageInfo(url: string) { @@ -117,7 +138,7 @@ async function getEventPageInfo(url: string) { } -run(); +scrappedData = run(); async function getOnePage() { @@ -130,4 +151,4 @@ async function getOnePage() { } // getOnePage() -saveScrappeddata(scrappedData); \ No newline at end of file +// saveScrappeddata(scrappedData); \ No newline at end of file diff --git a/sources_examples/ccpl_scrapped.json b/sources_examples/ccpl_scrapped.json index 2ab48a6..c100080 100644 --- a/sources_examples/ccpl_scrapped.json +++ b/sources_examples/ccpl_scrapped.json @@ -1,5 +1,41 @@ { - "pages": [], - "titleList": null, - "linkTitleEvent": null + "scrappedData": {}, + "selector": "#widgit_results_agenda .widgit_result .title", + "titleList": [ + "Soirée jeux", + "Conférence Patrimoine : les cloches de l'église des Molières", + "Contes en famille", + "Les Hivernales, spectacle Hip Hop", + "Théâtre - À cause des garçons", + "Soirée jeux", + "Soirée Jazz Blues Latino", + "Soirée jeux", + "Fête des Plantes de Printemps du Domaine de Saint-Jean de Beauregard", + "Fête des Plantes de Printemps du Domaine de Saint-Jean de Beauregard", + "Fête des Plantes de Printemps du Domaine de Saint-Jean de Beauregard", + "Soirée jeux", + "Fête de la création et des métiers d'art", + "Fête de la création et des métiers d'art", + "Fête de la création et des métiers d'art", + "Soirée jeux" + ], + "hrefsDetails": [ + null, + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=5997388", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=5997492", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=6002033", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=5995489", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=6002454", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=5997388", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=6001129", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=5997388", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=4718535", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=4718535", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=4718535", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=5997388", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=4747897", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=4747897", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=4747897", + "https://widgets.apidae-tourisme.com/details/247.js?apidae_id=5997388" + ] } \ No newline at end of file