get one page data
This commit is contained in:
parent
055a811441
commit
04f561a9d3
157
scrapers/ccpl.ts
157
scrapers/ccpl.ts
@ -2,48 +2,14 @@ const puppeteer = require('puppeteer');
|
||||
import * as fs from 'fs';
|
||||
|
||||
// configure database
|
||||
const mongoose = require('mongoose');
|
||||
const EventMobilizon = require('./../models/Event');
|
||||
|
||||
// get data from webpage
|
||||
|
||||
async function run() {
|
||||
|
||||
let options = {};
|
||||
options = {headless: false, devtools: true }
|
||||
const browser = await puppeteer.launch(options);
|
||||
const page = await browser.newPage();
|
||||
|
||||
|
||||
await page.goto('https://www.cc-paysdelimours.fr/agenda');
|
||||
|
||||
// chaque lien d'évènement est un .widgit_result
|
||||
// titre: .widgit_result .widgit_title
|
||||
// titre: #widgit_event_details .widgit_title
|
||||
|
||||
|
||||
await page.evaluate(() => {
|
||||
const scrapped: any = {};
|
||||
let sel = '#widgit_results_agenda .widgit_result .title';
|
||||
|
||||
let listOfElements: any = document.querySelectorAll(sel);
|
||||
let titleList: any = [];
|
||||
|
||||
|
||||
if (listOfElements.length) {
|
||||
listOfElements.forEach((elem: any) => {
|
||||
console.log('title', elem.innerHTML);
|
||||
titleList.push(elem.innerHTML);
|
||||
})
|
||||
}
|
||||
scrapped.selector = sel;
|
||||
scrapped.titleList = titleList;
|
||||
// sauver les contenus dans un fichier json
|
||||
function saveScrappeddata(data: any) {
|
||||
let fileName = 'ccpl_scrapped.json';
|
||||
|
||||
fs.writeFile(
|
||||
`./sources_examples/${fileName}`,
|
||||
JSON.stringify(scrapped, null, 4),
|
||||
JSON.stringify(data, null, 4),
|
||||
"utf8",
|
||||
(err: any) => {
|
||||
if (err) {
|
||||
@ -53,30 +19,115 @@ async function run() {
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
/** Accumulated scraping results that are eventually written to disk as JSON. */
interface ScrappedData {
    /** One entry per scraped event detail page. */
    pages: unknown[];
    /** Titles found on the agenda listing; `null` until the listing has been scraped. */
    titleList: string[] | null;
    /** `href` values found on the agenda listing; `null` until the listing has been scraped. */
    linkTitleEvent: (string | null)[] | null;
}

// Puppeteer launch options: a visible browser window with DevTools open.
const options = {headless: false, devtools: true};

// Shared result container, filled in by the scraping functions of this module.
const scrappedData: ScrappedData = {
    pages: [],
    titleList: null,
    linkTitleEvent: null
};
|
||||
|
||||
|
||||
/**
 * Opens the CCPL agenda page and collects, for every element matching
 * `#widgit_results_agenda .widgit_result .title`, its inner HTML (the title)
 * and its `href` attribute.
 *
 * The two collected lists are stored on the module-level `scrappedData`
 * object (`titleList` and `linkTitleEvent`). The browser is closed in all
 * cases, including when navigation or extraction fails.
 *
 * @returns a promise that resolves once the browser has been closed
 */
async function run() {
    const browser = await puppeteer.launch(options);

    try {
        const page = await browser.newPage();
        await page.goto('https://www.cc-paysdelimours.fr/agenda');

        // Each event link is a `.widgit_result`.
        // Title on the detail view: `#widgit_event_details .widgit_title`.
        const agenda = await page.evaluate(() => {
            // NOTE: this callback runs inside the page, so it must be self-contained
            // and its console output goes to the browser console, not to Node.
            const sel = '#widgit_results_agenda .widgit_result .title';
            const titleList: string[] = [];
            const linkTitleEvent: (string | null)[] = [];

            // forEach on an empty NodeList is a no-op, so no length guard is needed.
            document.querySelectorAll(sel).forEach((elem) => {
                console.log('title', elem.innerHTML);
                titleList.push(elem.innerHTML);
                // getAttribute yields null when the element carries no href.
                linkTitleEvent.push(elem.getAttribute('href'));
            });

            console.log('titleList', titleList);

            return {
                selector: sel,
                titleList,
                linkTitleEvent
            };
        });

        // Publish the lists on the shared container instead of a shadowing local.
        scrappedData.titleList = agenda.titleList;
        scrappedData.linkTitleEvent = agenda.linkTitleEvent;

        // TODO: visit every collected link, with a delay between requests:
        // scrappedData.linkTitleEvent.forEach((url: string) => {
        //     // add delay
        //     getEventPageInfo(url)
        // })
    } finally {
        // Always release the browser process, even if scraping threw.
        await browser.close();
    }

    console.log('DONE');
}
|
||||
|
||||
function upsertEventMobilizon(eventObject:any) {
|
||||
|
||||
const DB_URL = 'mongodb://localhost/thal';
|
||||
|
||||
if (mongoose.connection.readyState == 0) { mongoose.connect(DB_URL); }
|
||||
|
||||
// if this email exists, update the entry, don't insert
|
||||
let conditions = { title: eventObject.title };
|
||||
let options = { upsert: true, new: true, setDefaultsOnInsert: true };
|
||||
|
||||
EventMobilizon.findOneAndUpdate(conditions, eventObject, options, (err:any, result:any) => {
|
||||
if (err){
|
||||
console.log('result,err', result,err);
|
||||
throw err;
|
||||
/**
 * Data extracted from a single event detail page.
 * Except for `description` and `address`, a field is `null` when the
 * corresponding node or attribute is absent from the page.
 */
interface EventPageInfo {
    title: string | null;
    description: string;
    website: string | null;
    map_latitude: string | null;
    map_longitude: string | null;
    files: string | null;
    contact_all: string | null;
    address: string;
    phone: string | null;
    date: string | null;
}

/**
 * Loads one event page and extracts its details from the DOM.
 *
 * A dedicated browser instance is launched for the call and closed before
 * returning, including when navigation or extraction fails. The extracted
 * record is also appended to the module-level `scrappedData.pages`.
 *
 * @param url address of the event page to open
 * @returns the extracted event details
 */
async function getEventPageInfo(url: string): Promise<EventPageInfo> {
    const browser = await puppeteer.launch(options);

    try {
        const page = await browser.newPage();
        await page.goto(url);

        const eventInfo: EventPageInfo = await page.evaluate(() => {
            // NOTE: this callback runs inside the page, so the helpers below must
            // be declared here rather than at module level.
            const htmlOf = (selector: string): string | null =>
                document.querySelector(selector)?.innerHTML ?? null;
            const attrOf = (selector: string, name: string): string | null =>
                document.querySelector(selector)?.getAttribute(name) ?? null;

            // The description is the concatenated markup of every `.desc` node.
            let description = '';
            document.querySelectorAll('.desc').forEach((element) => {
                description += element.innerHTML;
            });

            // Concatenate up to the first three contact paragraphs; missing ones are
            // skipped instead of contributing the text "undefined".
            const address = Array.from(document.querySelectorAll<HTMLElement>('.contact p'))
                .slice(0, 3)
                .map((paragraph) => paragraph.innerText)
                .join('');

            const info = {
                title: htmlOf('.widgit_title'),
                description,
                website: attrOf('.website', 'href'),
                map_latitude: attrOf('#details_map', 'data-lat'),
                map_longitude: attrOf('#details_map', 'data-lng'),
                files: attrOf('.attachments a', 'href'),
                contact_all: htmlOf('.contact'),
                address,
                phone: htmlOf('.mbvs'),
                date: htmlOf('.openings .pre'),
            };

            console.log('eventInfo inside', info);
            return info;
        });

        console.log('eventInfo outside', eventInfo);
        scrappedData.pages.push(eventInfo);

        return eventInfo;
    } finally {
        // Always release the browser process, even if scraping threw.
        await browser.close();
    }
}
|
||||
|
||||
/**
 * Scrapes a single event page and logs the accumulated `scrappedData`.
 * Nothing is written to disk here; persisting is left to the caller.
 *
 * @param url event page to scrape; defaults to a known sample event
 */
async function getOnePage(
    url: string = 'https://www.cc-paysdelimours.fr/agenda#widget-details-offre-4718535'
): Promise<void> {
    await getEventPageInfo(url);

    console.log('scrappedData outside', scrappedData);
}

// getOnePage()

// Entry point: scrape the agenda, then persist whatever has been collected.
// Saving is chained after `run()` settles so that the file is not written
// before the scrape has finished; a failure is reported instead of being
// left as an unhandled rejection, and the data gathered so far is still saved.
run()
    .catch((err: unknown) => {
        console.error('scraping failed', err);
        process.exitCode = 1;
    })
    .then(() => saveScrappeddata(scrappedData));
|
5
sources_examples/ccpl_scrapped.json
Normal file
5
sources_examples/ccpl_scrapped.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"pages": [],
|
||||
"titleList": null,
|
||||
"linkTitleEvent": null
|
||||
}
|
Loading…
Reference in New Issue
Block a user