Un script Node.js pour enrichir une instance Mobilizon à partir d'un flux RSS présentant des évènements. Ce script détecte les évènements déjà existants dans l'instance Mobilizon et ne crée que ceux qui n'y sont pas encore présents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

157 lines
4.5 KiB

const puppeteer = require('puppeteer');
import * as fs from 'fs';
import parserConfig from "../config";
// Launch settings handed to every puppeteer.launch() call in this file:
// a visible browser window with the devtools panel open.
let options: any = {headless: false, devtools: true};

// Shared accumulator for the scrape. Event detail records are appended to `pages`;
// `titleList` and `linkTitleEvent` start out as null.
let scrappedData: any = {pages: [], titleList: null, linkTitleEvent: null};
/**
 * Scrape the CCPL agenda and save the result as JSON.
 *
 * Steps:
 *  1. open the agenda listing page;
 *  2. read the title and the `data-w-href` detail link of every listed event;
 *  3. visit the first few detail pages with getEventPageInfo(), one after the other;
 *  4. write `{scrappedData, ...listing}` to `./sources_examples/ccpl_scrapped.json`.
 *
 * A detail page that fails to scrape is logged and skipped; a failed file write is
 * logged as well. Neither aborts the run.
 *
 * @returns the listing data read from the agenda page: `selector`, `titleList`
 *          and `hrefsDetails`. DOM nodes are no longer part of the result because
 *          they cannot be transferred out of the browser page.
 */
async function run(): Promise<any> {
    const agendaUrl = 'https://www.cc-paysdelimours.fr/agenda';
    // Upper bound on the number of detail pages visited per run.
    const maxEventPages = 3;

    const browser = await puppeteer.launch(options);
    try {
        const page = await browser.newPage();
        await page.goto(agendaUrl);

        // Each event link is a .widgit_result
        // title: #widgit_event_details .widgit_title
        // The callback below runs inside the browser page, so it may only return
        // plain serialisable values (strings, arrays, objects), never DOM nodes.
        const dataRun: any = await page.evaluate(() => {
            const sel = '#widgit_results_agenda .widgit_result .title';
            const titleList = Array.from(
                document.querySelectorAll(sel),
                (elem) => elem.innerHTML
            );
            // Entries are null for anchors that carry no data-w-href attribute.
            const hrefsDetails = Array.from(
                document.querySelectorAll('#widgit_results_agenda a'),
                (elem) => elem.getAttribute('data-w-href')
            );
            // Printed in the browser console, not in the Node.js terminal.
            console.log('titleList', titleList);
            return {selector: sel, titleList, hrefsDetails};
        });

        // Keep usable links only, drop duplicates, and cap the number of visits.
        const usableHrefs = (dataRun.hrefsDetails as Array<string | null>).filter(
            (href): href is string => typeof href === 'string' && href.length > 0
        );
        const eventHrefs = Array.from(new Set(usableHrefs)).slice(0, maxEventPages);

        for (const href of eventHrefs) {
            try {
                // Resolve relative or hash-only links against the agenda URL.
                const url = new URL(href, agendaUrl).toString();
                console.log('url', url);
                // Awaited so pages are scraped one at a time and are complete before
                // the file is written. getEventPageInfo() records its result in
                // scrappedData.pages itself, so it is not pushed again here.
                await getEventPageInfo(url);
            } catch (err) {
                console.log(`Error scraping event page ${href}: ${err}`);
            }
        }
        console.log('DONE');

        const data = {scrappedData, ...dataRun};
        const fileName = 'ccpl_scrapped.json';
        try {
            await fs.promises.writeFile(
                `./sources_examples/${fileName}`,
                JSON.stringify(data, null, 4),
                'utf8'
            );
            console.log(`File ${fileName} is written successfully!`);
        } catch (err) {
            console.log(`Error writing file: ${err}`);
        }
        return dataRun;
    } finally {
        // Always release the browser, including when navigation or scraping throws.
        await browser.close();
    }
}
/**
 * Scrape one event detail page.
 *
 * Opens `url` in its own browser instance, reads the event fields from the DOM,
 * appends the record to the module-level `scrappedData.pages` and returns it.
 * A field whose DOM node (or attribute) is absent is reported as `null` instead
 * of making the whole scrape throw.
 *
 * @param url absolute URL of the event detail page
 * @returns the scraped record: title, description, website, map_latitude,
 *          map_longitude, files, contact_all, address, phone and date
 */
async function getEventPageInfo(url: string): Promise<any> {
    const browser = await puppeteer.launch(options);
    try {
        const page = await browser.newPage();
        await page.goto(url);

        const eventInfo = await page.evaluate(() => {
            // These helpers live inside the callback because the callback is executed
            // in the browser page and cannot see anything from the Node.js scope.
            const htmlOf = (selector: string): string | null =>
                document.querySelector(selector)?.innerHTML ?? null;
            const attrOf = (selector: string, attribute: string): string | null =>
                document.querySelector(selector)?.getAttribute(attribute) ?? null;

            let description = '';
            document.querySelectorAll('.desc').forEach((element) => {
                description += element.innerHTML;
            });

            // Text of the first three contact paragraphs, concatenated without a
            // separator (same format as before); missing paragraphs contribute
            // nothing rather than the literal text "undefined".
            // NOTE(review): a separator between the parts may be wanted - confirm
            // with whatever consumes `address`.
            const address = Array.from(document.querySelectorAll<HTMLElement>('.contact p'))
                .slice(0, 3)
                .map((paragraph) => paragraph.innerText)
                .join('');

            const scraped = {
                title: htmlOf('.widgit_title'),
                description,
                website: attrOf('.website', 'href'),
                map_latitude: attrOf('#details_map', 'data-lat'),
                map_longitude: attrOf('#details_map', 'data-lng'),
                files: attrOf('.attachments a', 'href'),
                contact_all: htmlOf('.contact'),
                address,
                phone: htmlOf('.mbvs'),
                date: htmlOf('.openings .pre'),
            };
            // Printed in the browser console, not in the Node.js terminal.
            console.log('eventInfo inside', scraped);
            return scraped;
        });

        console.log('eventInfo outside', eventInfo);
        scrappedData.pages.push(eventInfo);
        return eventInfo;
    } finally {
        // Always release the browser, including when navigation or scraping throws.
        await browser.close();
    }
}
// Start the scrape. The promise returned by run() must NOT be assigned to
// `scrappedData`: that would replace the shared accumulator object with a Promise,
// and every later `scrappedData.pages.push(...)` would then throw.
run().catch((err: unknown) => {
    console.error('CCPL agenda scraping failed:', err);
    // Signal the failure to the calling shell without cutting pending I/O short.
    process.exitCode = 1;
});
/**
 * Manual check helper: scrape one known event page, then print the shared
 * `scrappedData` object.
 */
async function getOnePage() {
    const sampleEventUrl = 'https://www.cc-paysdelimours.fr/agenda#widget-details-offre-4718535';
    await getEventPageInfo(sampleEventUrl);
    // TODO: save the contents to a JSON file; for now they are only logged.
    console.log('scrappedData outside', scrappedData);
}
// getOnePage()
// saveScrappeddata(scrappedData);