const puppeteer = require('puppeteer');
import * as fs from 'fs';
import parserConfig from "../config";

// Scrape event data from the CCPL agenda web page.

/** Details scraped from one event page; a field is null when its node is missing. */
interface EventInfo {
    title: string | null;
    description: string;
    website: string | null;
    map_latitude: string | null;
    map_longitude: string | null;
    files: string | null;
    contact_all: string | null;
    address: string;
    phone: string | null;
    date: string | null;
}

/** What the agenda listing page yields: event titles and the raw detail links. */
interface AgendaListing {
    selector: string;
    titleList: string[];
    hrefsDetails: (string | null)[];
}

/** Module-level accumulator that ends up in the output JSON file. */
interface ScrappedData {
    pages: EventInfo[];
    titleList: string[] | null;
    linkTitleEvent: unknown;
}

/** Agenda listing page; also used as the base when resolving detail links. */
const AGENDA_URL = 'https://www.cc-paysdelimours.fr/agenda';

/** Only the first few event pages are visited, to keep a run short. */
const MAX_EVENT_PAGES = 3;

/** Puppeteer launch options: visible browser window with devtools open. */
const options = { headless: false, devtools: true };

const scrappedData: ScrappedData = {
    pages: [],
    titleList: null,
    linkTitleEvent: null,
};

/**
 * Fetch the CCPL agenda, find all links to event details, scrape the first
 * MAX_EVENT_PAGES event pages, and write everything to
 * `./sources_examples/ccpl_scrapped.json`.
 *
 * Event pages are visited one after another in the same browser, and each one
 * is awaited, so the output file is only written once `scrappedData.pages`
 * is actually populated. A failure on a single event page is logged and
 * skipped; a failure to write the file is logged, not thrown.
 *
 * @returns the titles and raw detail links found on the agenda page
 */
async function run(): Promise<AgendaListing> {
    const browser = await puppeteer.launch(options);
    try {
        const page = await browser.newPage();
        await page.goto(AGENDA_URL);

        // Each event link is a .widgit_result
        // title: #widgit_event_details .widgit_title
        const dataRun: AgendaListing = await page.evaluate(() => {
            // This callback runs inside the browser page: it must be
            // self-contained and may only return JSON-serializable values
            // (DOM nodes / NodeLists do not survive the trip back to Node).
            const sel = '#widgit_results_agenda .widgit_result .title';

            const hrefsDetails: (string | null)[] = [];
            document.querySelectorAll('#widgit_results_agenda a').forEach((elem) => {
                hrefsDetails.push(elem.getAttribute('data-w-href'));
            });

            const titleList: string[] = [];
            document.querySelectorAll(sel).forEach((elem) => {
                console.log('title', elem.innerHTML);
                titleList.push(elem.innerHTML);
            });

            console.log('titleList', titleList);
            return { selector: sel, titleList, hrefsDetails };
        });

        const hrefs = dataRun.hrefsDetails
            .filter((href): href is string => typeof href === 'string' && href !== '')
            .slice(0, MAX_EVENT_PAGES);

        for (const href of hrefs) {
            try {
                // NOTE(review): assumes data-w-href may be relative or hash-only;
                // resolving against the agenda URL leaves absolute URLs untouched.
                const url = new URL(href, AGENDA_URL).href;
                console.log('url', url);
                // getEventPageInfo records its result in scrappedData.pages itself.
                await getEventPageInfo(url, browser);
            } catch (err) {
                console.log(`Error scraping event page ${href}: ${err}`);
            }
        }
        console.log('DONE');

        const data = { scrappedData, ...dataRun };
        const fileName = 'ccpl_scrapped.json';
        try {
            await fs.promises.writeFile(
                `./sources_examples/${fileName}`,
                JSON.stringify(data, null, 4),
                'utf8',
            );
            console.log(`File ${fileName} is written successfully!`);
        } catch (err) {
            console.log(`Error writing file: ${err}`);
        }

        return dataRun;
    } finally {
        // Close the browser even when scraping failed part-way through.
        await browser.close();
    }
}

/**
 * Scrape one event page and append the result to `scrappedData.pages`.
 *
 * @param url address of the event page
 * @param sharedBrowser optional, already-launched Puppeteer browser to reuse;
 *   it is left open for the caller. When omitted, a browser is launched for
 *   this call and closed before returning. Typed `any` because puppeteer is
 *   loaded through an untyped `require`.
 * @returns the scraped fields; a field is null when the page lacks its node
 */
async function getEventPageInfo(url: string, sharedBrowser?: any): Promise<EventInfo> {
    const ownsBrowser = sharedBrowser == null;
    const browser = ownsBrowser ? await puppeteer.launch(options) : sharedBrowser;
    try {
        const page = await browser.newPage();
        try {
            await page.goto(url);

            const eventInfo: EventInfo = await page.evaluate(() => {
                // Runs inside the browser page; helpers must be defined here.
                const html = (selector: string): string | null =>
                    document.querySelector(selector)?.innerHTML ?? null;
                const attr = (selector: string, name: string): string | null =>
                    document.querySelector(selector)?.getAttribute(name) ?? null;

                let description = '';
                document.querySelectorAll('.desc').forEach((element) => {
                    description += element.innerHTML;
                });

                // Concatenate the text of up to the first three contact
                // paragraphs, skipping missing ones instead of emitting "undefined".
                const address = Array.from(document.querySelectorAll<HTMLElement>('.contact p'))
                    .slice(0, 3)
                    .map((paragraph) => paragraph.innerText)
                    .join('');

                const info = {
                    title: html('.widgit_title'),
                    description,
                    website: attr('.website', 'href'),
                    map_latitude: attr('#details_map', 'data-lat'),
                    map_longitude: attr('#details_map', 'data-lng'),
                    files: attr('.attachments a', 'href'),
                    contact_all: html('.contact'),
                    address,
                    phone: html('.mbvs'),
                    date: html('.openings .pre'),
                };
                console.log('eventInfo inside', info);
                return info;
            });

            console.log('eventInfo outside', eventInfo);
            scrappedData.pages.push(eventInfo);
            return eventInfo;
        } finally {
            await page.close();
        }
    } finally {
        if (ownsBrowser) {
            await browser.close();
        }
    }
}

// Start scraping on load. The result lives in the module-level scrappedData
// accumulator, which must not be overwritten with the returned promise.
run().catch((err: unknown) => {
    console.error('Scraping run failed:', err);
});

/**
 * Debug helper: scrape one known event page and log the accumulated data.
 */
async function getOnePage(): Promise<void> {
    await getEventPageInfo('https://www.cc-paysdelimours.fr/agenda#widget-details-offre-4718535');
    // TODO: save the contents to a JSON file
    console.log('scrappedData outside', scrappedData);
}

// getOnePage()
// saveScrappeddata(scrappedData);