// rss-feeder-mobilizon/scrapers/ccpl.ts
//
// 154 lines
// 4.4 KiB
// TypeScript

const puppeteer = require('puppeteer');
import * as fs from 'fs';
// configure database
// get data from webpage
/**
 * Serialize `data` as pretty-printed JSON (4-space indent) and write it
 * asynchronously to `./sources_examples/ccpl_scrapped.json`.
 *
 * The function returns immediately; the result of the write is only reported
 * on the console (write errors are logged, not thrown).
 *
 * @param data - any JSON-serializable value to persist.
 */
function saveScrappeddata(data: any) {
    const fileName = 'ccpl_scrapped.json';
    const targetPath = `./sources_examples/${fileName}`;
    const serialized = JSON.stringify(data, null, 4);

    // Completion callback for fs.writeFile: log success or the error.
    const reportOutcome = (err: any) => {
        if (!err) {
            console.log(`File ${fileName} is written successfully!`);
            return;
        }
        console.log(`Error writing file: ${err}`);
    };

    fs.writeFile(targetPath, serialized, "utf8", reportOutcome);
}
// Module-level accumulator for scraped results; `pages` receives one entry
// per event detail page scraped by getEventPageInfo().
let scrappedData: any = {
    pages: [],
    titleList: null,
    linkTitleEvent: null,
};
// Launch settings passed to puppeteer.launch(): a visible (non-headless)
// browser window with devtools opened.
let options: any = {
    headless: false,
    devtools: true,
};
/**
 * Fetch the CCPL agenda page and collect, for every listed event, its title
 * and the `data-w-href` attribute of each link found in the results list.
 *
 * The collected data is logged, handed to `saveScrappeddata` (merged with the
 * module-level `scrappedData` under a `scrappedData` key), and returned.
 * Event detail pages are not visited yet; see the TODO below.
 *
 * @returns an object `{ selector, titleList, hrefsDetails }` where
 *   `hrefsDetails` holds one entry per link (`null` when the link has no
 *   `data-w-href` attribute).
 */
async function run() {
    const browser = await puppeteer.launch(options);
    const page = await browser.newPage();
    await page.goto('https://www.cc-paysdelimours.fr/agenda');

    // Each event link is a `.widgit_result`.
    // On a detail view the title is `#widgit_event_details .widgit_title`.
    const dataRun = await page.evaluate(() => {
        const sel = '#widgit_results_agenda .widgit_result .title';
        const titleNodes = document.querySelectorAll(sel);
        const linkNodes = document.querySelectorAll('#widgit_results_agenda a');

        const hrefsDetails: (string | null)[] = [];
        linkNodes.forEach((elem) => {
            hrefsDetails.push(elem.getAttribute('data-w-href'));
        });

        const titleList: string[] = [];
        titleNodes.forEach((elem) => {
            console.log('title', elem.innerHTML);
            titleList.push(elem.innerHTML);
        });

        console.log('titleList', titleList);
        // Only plain, serializable data may cross the evaluate() boundary.
        // DOM nodes / NodeLists do not survive it, so they are not returned.
        return {
            selector: sel,
            titleList,
            hrefsDetails,
        };
    });

    // TODO: visit each detail page (getEventPageInfo(url)), with a delay
    // between requests; for now the collected links are only logged.
    dataRun.hrefsDetails.forEach((url: string | null) => {
        console.log('url', url);
    });

    // NOTE(review): the browser is intentionally left open here, as in the
    // original (the close call was commented out) - confirm before enabling.
    // browser.close();
    console.log('DONE');
    saveScrappeddata({scrappedData, ...dataRun});
    return dataRun;
}
/**
 * Open a single event page in a freshly launched browser, extract the event
 * details from the DOM, append them to `scrappedData.pages` and return them.
 *
 * Fields whose DOM node is missing on the page are reported as `null`
 * (or as an empty string for `description` / `address`) instead of making
 * the whole extraction throw.
 *
 * The browser is always closed, even when navigation or extraction fails.
 *
 * @param url - absolute URL of the event detail page.
 * @returns the extracted event information.
 */
async function getEventPageInfo(url: string) {
    const browser = await puppeteer.launch(options);
    try {
        const page = await browser.newPage();
        await page.goto(url);

        const eventInfo = await page.evaluate(() => {
            // These helpers must be declared inside the callback: it is
            // executed in the page context, where outer-scope functions of
            // this module do not exist.
            const html = (selector: string): string | null =>
                document.querySelector(selector)?.innerHTML ?? null;
            const attr = (selector: string, name: string): string | null =>
                document.querySelector(selector)?.getAttribute(name) ?? null;

            let description = '';
            document.querySelectorAll('.desc').forEach((element) => {
                description += element.innerHTML;
            });

            // Concatenate the text of the first three contact paragraphs;
            // paragraphs that are absent contribute nothing.
            const address = Array.from(document.querySelectorAll<HTMLElement>('.contact p'))
                .slice(0, 3)
                .map((paragraph) => paragraph.innerText)
                .join('');

            const info = {
                title: html('.widgit_title'),
                description,
                website: attr('.website', 'href'),
                map_latitude: attr('#details_map', 'data-lat'),
                map_longitude: attr('#details_map', 'data-lng'),
                files: attr('.attachments a', 'href'),
                contact_all: html('.contact'),
                address,
                phone: html('.mbvs'),
                date: html('.openings .pre'),
            };
            console.log('eventInfo inside', info);
            return info;
        });

        console.log('eventInfo outside', eventInfo);
        scrappedData.pages.push(eventInfo);
        return eventInfo;
    } finally {
        // Release the browser process whatever happened above.
        await browser.close();
    }
}
// Start the agenda scrape. The promise returned by run() is deliberately NOT
// stored in `scrappedData`: that variable is the results accumulator object
// (its `pages` array is pushed to by getEventPageInfo), and replacing it with
// a pending Promise makes it serialize as `{}` and breaks `.pages.push(...)`.
// A failure is reported here instead of surfacing as an unhandled rejection.
run().catch((err: unknown) => {
    console.error('CCPL scraping failed:', err);
});
/**
 * Scrape one hard-coded event detail page through getEventPageInfo(), then
 * log the module-level `scrappedData`.
 */
async function getOnePage() {
    const sampleEventUrl = 'https://www.cc-paysdelimours.fr/agenda#widget-details-offre-4718535';
    await getEventPageInfo(sampleEventUrl);
    // TODO: save the contents to a JSON file
    console.log('scrappedData outside', scrappedData);
}
// getOnePage()
// saveScrappeddata(scrappedData);