get one page data
This commit is contained in:
parent
055a811441
commit
04f561a9d3
157
scrapers/ccpl.ts
157
scrapers/ccpl.ts
@ -2,48 +2,14 @@ const puppeteer = require('puppeteer');
|
|||||||
import * as fs from 'fs';
|
import * as fs from 'fs';
|
||||||
|
|
||||||
// configure database
|
// configure database
|
||||||
const mongoose = require('mongoose');
|
|
||||||
const EventMobilizon = require('./../models/Event');
|
|
||||||
|
|
||||||
// get data from webpage
|
// get data from webpage
|
||||||
|
function saveScrappeddata(data: any) {
|
||||||
async function run() {
|
|
||||||
|
|
||||||
let options = {};
|
|
||||||
options = {headless: false, devtools: true }
|
|
||||||
const browser = await puppeteer.launch(options);
|
|
||||||
const page = await browser.newPage();
|
|
||||||
|
|
||||||
|
|
||||||
await page.goto('https://www.cc-paysdelimours.fr/agenda');
|
|
||||||
|
|
||||||
// chaque lien d'évènement est un .widgit_result
|
|
||||||
// titre: .widgit_result .widgit_title
|
|
||||||
// titre: #widgit_event_details .widgit_title
|
|
||||||
|
|
||||||
|
|
||||||
await page.evaluate(() => {
|
|
||||||
const scrapped: any = {};
|
|
||||||
let sel = '#widgit_results_agenda .widgit_result .title';
|
|
||||||
|
|
||||||
let listOfElements: any = document.querySelectorAll(sel);
|
|
||||||
let titleList: any = [];
|
|
||||||
|
|
||||||
|
|
||||||
if (listOfElements.length) {
|
|
||||||
listOfElements.forEach((elem: any) => {
|
|
||||||
console.log('title', elem.innerHTML);
|
|
||||||
titleList.push(elem.innerHTML);
|
|
||||||
})
|
|
||||||
}
|
|
||||||
scrapped.selector = sel;
|
|
||||||
scrapped.titleList = titleList;
|
|
||||||
// sauver les contenus dans un fichier json
|
|
||||||
let fileName = 'ccpl_scrapped.json';
|
let fileName = 'ccpl_scrapped.json';
|
||||||
|
|
||||||
fs.writeFile(
|
fs.writeFile(
|
||||||
`./sources_examples/${fileName}`,
|
`./sources_examples/${fileName}`,
|
||||||
JSON.stringify(scrapped, null, 4),
|
JSON.stringify(data, null, 4),
|
||||||
"utf8",
|
"utf8",
|
||||||
(err: any) => {
|
(err: any) => {
|
||||||
if (err) {
|
if (err) {
|
||||||
@ -53,30 +19,115 @@ async function run() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Shape of the data accumulated while scraping, later serialised to JSON. */
interface ScrappedData {
    /** One entry per event detail page that has been visited. */
    pages: unknown[];
    /** Titles found on the agenda listing, or null until the listing is scraped. */
    titleList: string[] | null;
    /** `href` of each listed title, or null until the listing is scraped. */
    linkTitleEvent: (string | null)[] | null;
}

/**
 * Launch options passed to every `puppeteer.launch` call in this script.
 * The browser is shown with devtools open.
 */
const options = { headless: false, devtools: true };

/** Module-level accumulator that the scraping functions fill in. */
const scrappedData: ScrappedData = {
    pages: [],
    titleList: null,
    linkTitleEvent: null,
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Scrapes the agenda listing page and records every event title and link.
 *
 * Opens the agenda in a new browser, collects the inner HTML and `href` of
 * each element matching the listing selector, and stores both lists on the
 * module-level `scrappedData` object.
 *
 * @returns A promise that resolves once the browser has been closed.
 */
async function run() {
    const browser = await puppeteer.launch(options);

    try {
        const page = await browser.newPage();
        await page.goto('https://www.cc-paysdelimours.fr/agenda');

        // Each event link is a `.widgit_result`.
        // Title on a detail view: `#widgit_event_details .widgit_title`.
        const listing = await page.evaluate(() => {
            const sel = '#widgit_results_agenda .widgit_result .title';
            const titleList: string[] = [];
            const linkTitleEvent: (string | null)[] = [];

            document.querySelectorAll(sel).forEach((elem) => {
                console.log('title', elem.innerHTML);
                titleList.push(elem.innerHTML);
                // DOM elements expose attributes through getAttribute();
                // there is no `attribute` property to index into.
                linkTitleEvent.push(elem.getAttribute('href'));
            });

            console.log('titleList', titleList);
            return { selector: sel, titleList, linkTitleEvent };
        });

        // Publish the results on the shared object instead of a shadowing
        // local, so they are part of what gets saved.
        scrappedData.titleList = listing.titleList;
        scrappedData.linkTitleEvent = listing.linkTitleEvent;

        // TODO: visit every collected link, with a delay between requests:
        // scrappedData.linkTitleEvent.forEach((url: string) => {
        //     getEventPageInfo(url)
        // })
    } finally {
        // Always release the browser, even when navigation or evaluation fails.
        await browser.close();
    }

    console.log('DONE');
}
|
||||||
|
|
||||||
function upsertEventMobilizon(eventObject:any) {
|
/**
 * Opens one event page in its own browser and extracts the event details.
 *
 * The extracted record is also appended to `scrappedData.pages`. Fields whose
 * DOM node is absent are returned as `null` instead of aborting the scrape.
 *
 * @param url Address of the event page to load.
 * @returns The extracted event record.
 */
async function getEventPageInfo(url: string) {
    const browser = await puppeteer.launch(options);

    try {
        const page = await browser.newPage();
        await page.goto(url);

        const eventInfo = await page.evaluate(() => {
            // These helpers are declared inside the callback because it is
            // executed in the page, where outer-scope functions do not exist.
            const html = (selector: string): string | null =>
                document.querySelector(selector)?.innerHTML ?? null;
            const attr = (selector: string, name: string): string | null =>
                document.querySelector(selector)?.getAttribute(name) ?? null;

            let description = '';
            document.querySelectorAll('.desc').forEach((element) => {
                description += element.innerHTML;
            });

            // Concatenate up to the first three contact paragraphs; missing
            // ones are skipped rather than rendered as the text "undefined".
            const address = Array.from(document.querySelectorAll<HTMLElement>('.contact p'))
                .slice(0, 3)
                .map((paragraph) => paragraph.innerText)
                .join('');

            const info = {
                title: html('.widgit_title'),
                description,
                website: attr('.website', 'href'),
                map_latitude: attr('#details_map', 'data-lat'),
                map_longitude: attr('#details_map', 'data-lng'),
                files: attr('.attachments a', 'href'),
                contact_all: html('.contact'),
                address,
                phone: html('.mbvs'),
                date: html('.openings .pre'),
            };

            console.log('eventInfo inside', info);
            return info;
        });

        console.log('eventInfo outside', eventInfo);
        scrappedData.pages.push(eventInfo);

        return eventInfo;
    } finally {
        // Always release the browser, even when navigation or evaluation fails.
        await browser.close();
    }
}
|
||||||
|
|
||||||
/**
 * Debug helper: scrapes one hard-coded event page and logs the accumulated
 * `scrappedData`. It does not write anything to disk itself.
 */
async function getOnePage() {
    await getEventPageInfo('https://www.cc-paysdelimours.fr/agenda#widget-details-offre-4718535');

    console.log('scrappedData outside', scrappedData);
}

// Entry point. The data is saved only after the scrape has finished;
// saving right after starting the async run() would always write the
// initial, empty structure.
run()
    .then(() => saveScrappeddata(scrappedData))
    .catch((err: unknown) => {
        console.error('scraping failed', err);
        // Keep a non-zero exit status so a failed scrape is still detectable.
        process.exitCode = 1;
    });

// getOnePage()
|
5
sources_examples/ccpl_scrapped.json
Normal file
5
sources_examples/ccpl_scrapped.json
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"pages": [],
|
||||||
|
"titleList": null,
|
||||||
|
"linkTitleEvent": null
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user