From 04f561a9d34bf75aa195cfb323cf2197dc8e3248 Mon Sep 17 00:00:00 2001 From: Tykayn Date: Wed, 12 Jan 2022 00:08:20 +0100 Subject: [PATCH] get one page data --- scrapers/ccpl.ts | 135 +++++++++++++++++++--------- sources_examples/ccpl_scrapped.json | 5 ++ 2 files changed, 98 insertions(+), 42 deletions(-) create mode 100644 sources_examples/ccpl_scrapped.json diff --git a/scrapers/ccpl.ts b/scrapers/ccpl.ts index bed04ae..a1d2805 100644 --- a/scrapers/ccpl.ts +++ b/scrapers/ccpl.ts @@ -2,31 +2,50 @@ const puppeteer = require('puppeteer'); import * as fs from 'fs'; // configure database -const mongoose = require('mongoose'); -const EventMobilizon = require('./../models/Event'); // get data from webpage +function saveScrappeddata(data: any) { + let fileName = 'ccpl_scrapped.json'; + + fs.writeFile( + `./sources_examples/${fileName}`, + JSON.stringify(data, null, 4), + "utf8", + (err: any) => { + if (err) { + console.log(`Error writing file: ${err}`); + } else { + console.log(`File ${fileName} is written successfully!`); + } + } + ); +} + +let options: any = {}; +let scrappedData: any = { + pages: [], + titleList: null, + linkTitleEvent: null +}; +options = {headless: false, devtools: true} + async function run() { - let options = {}; - options = {headless: false, devtools: true } const browser = await puppeteer.launch(options); const page = await browser.newPage(); - - await page.goto('https://www.cc-paysdelimours.fr/agenda'); // chaque lien d'évènement est un .widgit_result - // titre: .widgit_result .widgit_title // titre: #widgit_event_details .widgit_title - await page.evaluate(() => { - const scrapped: any = {}; - let sel = '#widgit_results_agenda .widgit_result .title'; + let scrappedData = await page.evaluate(() => { + + let sel = '#widgit_results_agenda .widgit_result .title'; let listOfElements: any = document.querySelectorAll(sel); + let linkTitleEvent: any = []; let titleList: any = []; @@ -34,49 +53,81 @@ async function run() { listOfElements.forEach((elem: any) => { console.log('title', elem.innerHTML); titleList.push(elem.innerHTML); + linkTitleEvent.push(elem.attribute['href']); }) } - scrapped.selector = sel; - scrapped.titleList = titleList; - // sauver les contenus dans un fichier json - let fileName = 'ccpl_scrapped.json'; + const scrapped: any = { + selector: sel, + titleList, + linkTitleEvent + }; - fs.writeFile( - `./sources_examples/${fileName}`, - JSON.stringify(scrapped, null, 4), - "utf8", - (err: any) => { - if (err) { - console.log(`Error writing file: ${err}`); - } else { - console.log(`File ${fileName} is written successfully!`); - } - } - ); console.log('titleList', titleList); - return titleList; + return scrapped; }); - // browser.close(); + + + // scrappedData.linkTitleEvent.forEach((url: string) => { + // // add delay + // getEventPageInfo(url) + // }) + + browser.close(); console.log('DONE'); } -function upsertEventMobilizon(eventObject:any) { +async function getEventPageInfo(url: string) { + const browser = await puppeteer.launch(options); + const page = await browser.newPage(); + await page.goto(url); + let eventInfo = await page.evaluate(() => { + let description = ''; + document.querySelectorAll('.desc').forEach((element: any) => { + description += element.innerHTML; + }) + let websiteNode: any = document.querySelector('.website'); + let mapNode: any = document.querySelector('#details_map'); + let filesNode: any = document.querySelector('.attachments a'); + let addressParagraphs: any = document.querySelectorAll('.contact p'); + // @ts-ignore + let eventInfo: any = { + title: document.querySelector('.widgit_title').innerHTML, + description, + website: websiteNode.getAttribute('href'), + map_latitude: mapNode.getAttribute('data-lat') + , + map_longitude: mapNode.getAttribute('data-lng'), + files: filesNode.getAttribute('href'), + contact_all: document.querySelector('.contact').innerHTML, + address: addressParagraphs[0]?.innerText + addressParagraphs[1]?.innerText + addressParagraphs[2]?.innerText, + phone: document.querySelector('.mbvs').innerHTML, + date: document.querySelector('.openings .pre').innerHTML, + } + console.log('eventInfo inside', eventInfo); + // + // saveScrappeddata(eventInfo); + // @ts-ignore + return eventInfo; + }) + console.log('eventInfo outside', eventInfo); + scrappedData.pages.push(eventInfo); - const DB_URL = 'mongodb://localhost/thal'; + browser.close(); + return eventInfo; - if (mongoose.connection.readyState == 0) { mongoose.connect(DB_URL); } - - // if this email exists, update the entry, don't insert - let conditions = { title: eventObject.title }; - let options = { upsert: true, new: true, setDefaultsOnInsert: true }; - - EventMobilizon.findOneAndUpdate(conditions, eventObject, options, (err:any, result:any) => { - if (err){ - console.log('result,err', result,err); - throw err; - } - }); } + run(); +async function getOnePage() { + let eventInfo = await getEventPageInfo('https://www.cc-paysdelimours.fr/agenda#widget-details-offre-4718535'); + +// sauver les contenus dans un fichier json + console.log('scrappedData outside', scrappedData); + + +} + +// getOnePage() +saveScrappeddata(scrappedData); \ No newline at end of file diff --git a/sources_examples/ccpl_scrapped.json b/sources_examples/ccpl_scrapped.json new file mode 100644 index 0000000..2ab48a6 --- /dev/null +++ b/sources_examples/ccpl_scrapped.json @@ -0,0 +1,5 @@ +{ + "pages": [], + "titleList": null, + "linkTitleEvent": null +} \ No newline at end of file