154 lines
4.4 KiB
TypeScript
154 lines
4.4 KiB
TypeScript
const puppeteer = require('puppeteer');
|
|
import * as fs from 'fs';
|
|
import parserConfig from "../config";
|
|
// get data from webpage

/**
 * Launch options passed to `puppeteer.launch()` by the scraping functions in this file.
 * `headless: false` and `devtools: true` are the values this script has always used.
 */
let options: any = { headless: false, devtools: true };

/**
 * Module-level accumulator for scraped results.
 * - `pages`: receives one entry per scraped event detail page.
 * - `titleList`, `linkTitleEvent`: start as `null`.
 *
 * Kept loosely typed (`any`) so existing code that touches it keeps compiling.
 */
let scrappedData: any = {
  pages: [],
  titleList: null,
  linkTitleEvent: null,
};
|
|
|
|
/**
 * Scrape the CCPL agenda.
 *
 * Steps:
 *  1. open the agenda listing page;
 *  2. collect the title and the `data-w-href` detail link of every listed event;
 *  3. scrape the detail page of the first `maxEvents` links, one after the other;
 *  4. write `{ scrappedData, ...listingData }` to `./sources_examples/ccpl_scrapped.json`.
 *
 * A failure on one detail page, or while writing the file, is logged and does not
 * abort the run.
 *
 * @param maxEvents upper bound on the number of detail pages to visit (defaults to 3,
 *                  the limit this script has always used).
 * @returns the data extracted from the listing page: `selector`, `titleList`,
 *          `hrefsDetails` and `listOfElementsLinks`.
 */
async function run(maxEvents = 3): Promise<any> {
  const agendaUrl = 'https://www.cc-paysdelimours.fr/agenda';
  const browser = await puppeteer.launch(options);

  try {
    const page = await browser.newPage();
    await page.goto(agendaUrl);

    // Each event on the listing is a `.widgit_result`; on a detail page the title is
    // `#widgit_event_details .widgit_title`.
    // Only serialisable values can be returned from page.evaluate(), so DOM nodes are
    // reduced to plain strings inside the page.
    const dataRun: any = await page.evaluate(() => {
      const sel = '#widgit_results_agenda .widgit_result .title';

      const titleList = Array.from(document.querySelectorAll(sel), (elem) => elem.innerHTML);

      // Links that carry no `data-w-href` attribute cannot be visited; leave them out.
      const hrefsDetails = Array.from(
        document.querySelectorAll('#widgit_results_agenda a'),
        (elem) => elem.getAttribute('data-w-href'),
      ).filter((href): href is string => !!href);

      return {
        selector: sel,
        titleList,
        hrefsDetails,
        // Key kept for backward compatibility. It used to hold the raw NodeList, which
        // cannot cross the page.evaluate() boundary; it now mirrors the link strings.
        listOfElementsLinks: hrefsDetails,
      };
    });

    const hrefsToVisit: string[] = dataRun.hrefsDetails.slice(0, Math.max(0, maxEvents));

    // Visit the pages sequentially: every call launches its own browser.
    for (const href of hrefsToVisit) {
      console.log('url', href);
      try {
        // The attribute may hold a relative address; absolute ones pass through unchanged.
        const url = new URL(href, agendaUrl).href;
        // getEventPageInfo() appends its result to scrappedData.pages itself,
        // so it must not be pushed a second time here.
        await getEventPageInfo(url);
      } catch (err) {
        console.log(`Error scraping ${href}: ${err}`);
      }
    }

    console.log('DONE');
    const data = { scrappedData, ...dataRun };
    const outputDir = './sources_examples';
    const fileName = 'ccpl_scrapped.json';

    try {
      await fs.promises.mkdir(outputDir, { recursive: true });
      await fs.promises.writeFile(
        `${outputDir}/${fileName}`,
        JSON.stringify(data, null, 4),
        'utf8',
      );
      console.log(`File ${fileName} is written successfully!`);
    } catch (err) {
      console.log(`Error writing file: ${err}`);
    }

    return dataRun;
  } finally {
    // Always release the browser, even when navigation or evaluation failed.
    await browser.close();
  }
}
|
|
|
|
/**
 * Scrape a single event detail page.
 *
 * Launches a dedicated browser, opens `url`, extracts the event fields from the DOM,
 * appends the result to `scrappedData.pages` and returns it. The browser is closed
 * whether or not the scrape succeeds.
 *
 * Fields whose element (or attribute) is absent from the page are returned as `null`
 * instead of making the whole scrape fail. `description` and `address` are empty
 * strings when no matching element exists.
 *
 * @param url address of the event detail page.
 * @returns an object with `title`, `description`, `website`, `map_latitude`,
 *          `map_longitude`, `files`, `contact_all`, `address`, `phone` and `date`.
 */
async function getEventPageInfo(url: string): Promise<any> {
  const browser = await puppeteer.launch(options);

  try {
    const page = await browser.newPage();
    await page.goto(url);

    const eventInfo = await page.evaluate(() => {
      // These helpers have to be declared inside the callback: the function is
      // executed in the page, where nothing from this module is in scope.
      const html = (selector: string): string | null =>
        document.querySelector(selector)?.innerHTML ?? null;
      const attr = (selector: string, name: string): string | null =>
        document.querySelector(selector)?.getAttribute(name) ?? null;

      const description = Array.from(
        document.querySelectorAll('.desc'),
        (element) => element.innerHTML,
      ).join('');

      // The address is the text of the first three contact paragraphs, concatenated.
      // Missing paragraphs are skipped rather than rendered as "undefined".
      const address = Array.from(document.querySelectorAll<HTMLElement>('.contact p'))
        .slice(0, 3)
        .map((paragraph) => paragraph.innerText)
        .join('');

      return {
        title: html('.widgit_title'),
        description,
        website: attr('.website', 'href'),
        map_latitude: attr('#details_map', 'data-lat'),
        map_longitude: attr('#details_map', 'data-lng'),
        files: attr('.attachments a', 'href'),
        contact_all: html('.contact'),
        address,
        phone: html('.mbvs'),
        date: html('.openings .pre'),
      };
    });

    console.log('eventInfo outside', eventInfo);
    scrappedData.pages.push(eventInfo);

    return eventInfo;
  } finally {
    // Release the browser even when navigation or evaluation threw.
    await browser.close();
  }
}
|
|
|
|
// Start the scrape. The returned promise is deliberately NOT assigned to `scrappedData`:
// doing so would replace the accumulator object with a Promise, and every later
// `scrappedData.pages.push(...)` would then throw.
run().catch((err: unknown) => {
  console.error('Scraping failed:', err);
  process.exitCode = 1;
});
|
|
|
|
/**
 * Scrape one hard-coded event detail page, then log the shared `scrappedData` object.
 * The scraped value itself is not used here; `getEventPageInfo` is called for its effect.
 */
async function getOnePage() {
  const singleEventUrl = 'https://www.cc-paysdelimours.fr/agenda#widget-details-offre-4718535';
  await getEventPageInfo(singleEventUrl);

  // save the contents to a JSON file (not implemented in this function)
  console.log('scrappedData outside', scrappedData);
}
|
|
|
|
// getOnePage()
|
|
// saveScrappeddata(scrappedData);
|