get one page data

This commit is contained in:
Tykayn 2022-01-12 00:08:20 +01:00 committed by tykayn
parent 055a811441
commit 04f561a9d3
2 changed files with 98 additions and 42 deletions

View File

@ -2,31 +2,50 @@ const puppeteer = require('puppeteer');
import * as fs from 'fs'; import * as fs from 'fs';
// configure database // configure database
const mongoose = require('mongoose');
const EventMobilizon = require('./../models/Event');
// get data from webpage // get data from webpage
/**
 * Serialises `data` as 4-space-indented JSON and writes it to
 * `./sources_examples/ccpl_scrapped.json`.
 *
 * The write goes through the callback form of `fs.writeFile`, so this function
 * returns before the outcome is known; success or failure is only reported on
 * the console.
 *
 * @param data - value handed to `JSON.stringify`
 */
function saveScrappeddata(data: any) {
  const fileName = 'ccpl_scrapped.json';
  const targetPath = `./sources_examples/${fileName}`;
  const payload = JSON.stringify(data, null, 4);

  // Completion handler: log the outcome, never throw.
  const reportOutcome = (err: any) => {
    if (!err) {
      console.log(`File ${fileName} is written successfully!`);
      return;
    }
    console.log(`Error writing file: ${err}`);
  };

  fs.writeFile(targetPath, payload, "utf8", reportOutcome);
}
// Launch options passed to `puppeteer.launch` by the scraping functions:
// a visible (non-headless) browser with devtools open.
let options: any = {headless: false, devtools: true};

// Module-level accumulator for scraped results; `pages` receives one entry
// per event page, the two list fields start out unset.
let scrappedData: any = {
  pages: [],
  titleList: null,
  linkTitleEvent: null,
};
// Scrapes the agenda listing page: launches a browser with `options`, opens
// the agenda URL and passes a callback to `page.evaluate` that collects the
// innerHTML of every element matching the title selector.
// NOTE(review): several lines in this span carry the same statement twice
// (e.g. the function header just below) and a `@ -34,49 +53,81 @@` hunk marker
// sits inside the body. This looks like the old and new columns of a
// side-by-side diff fused together, so the text is not compilable as-is.
// Confirm against the repository before relying on any single line.
async function run() { async function run() {
// This local `options` is declared inside the function and therefore hides
// the module-level variable of the same name.
let options = {};
options = {headless: false, devtools: true }
const browser = await puppeteer.launch(options); const browser = await puppeteer.launch(options);
const page = await browser.newPage(); const page = await browser.newPage();
await page.goto('https://www.cc-paysdelimours.fr/agenda'); await page.goto('https://www.cc-paysdelimours.fr/agenda');
// each event link is a .widgit_result // each event link is a .widgit_result
// title: .widgit_result .widgit_title
// title: #widgit_event_details .widgit_title // title: #widgit_event_details .widgit_title
// In the right-hand copy the result is bound to a local `scrappedData`, which
// is a different variable from the module-level `scrappedData`.
await page.evaluate(() => { let scrappedData = await page.evaluate(() => {
const scrapped: any = {};
let sel = '#widgit_results_agenda .widgit_result .title';
let sel = '#widgit_results_agenda .widgit_result .title';
let listOfElements: any = document.querySelectorAll(sel); let listOfElements: any = document.querySelectorAll(sel);
let linkTitleEvent: any = [];
let titleList: any = []; let titleList: any = [];
@ -34,49 +53,81 @@ async function run() {
listOfElements.forEach((elem: any) => { listOfElements.forEach((elem: any) => {
console.log('title', elem.innerHTML); console.log('title', elem.innerHTML);
titleList.push(elem.innerHTML); titleList.push(elem.innerHTML);
// NOTE(review): DOM elements expose `attributes` / `getAttribute(name)`, not
// `attribute`; `elem.attribute['href']` presumably throws on an undefined
// property — verify and consider `elem.getAttribute('href')`.
linkTitleEvent.push(elem.attribute['href']);
}) })
} }
scrapped.selector = sel; const scrapped: any = {
scrapped.titleList = titleList; selector: sel,
// save the contents to a JSON file titleList,
let fileName = 'ccpl_scrapped.json'; linkTitleEvent
};
// NOTE(review): this write sits textually inside the `page.evaluate`
// callback; presumably `fs` is not reachable from code that puppeteer runs in
// the page — verify.
fs.writeFile(
`./sources_examples/${fileName}`,
JSON.stringify(scrapped, null, 4),
"utf8",
(err: any) => {
if (err) {
console.log(`Error writing file: ${err}`);
} else {
console.log(`File ${fileName} is written successfully!`);
}
}
);
console.log('titleList', titleList); console.log('titleList', titleList);
return titleList; return scrapped;
}); });
// browser.close();
// The per-event crawl below is commented out, so only the listing page is
// visited by this function.
// scrappedData.linkTitleEvent.forEach((url: string) => {
// // add delay
// getEventPageInfo(url)
// })
// The result of close() is not awaited here.
browser.close();
console.log('DONE'); console.log('DONE');
} }
// NOTE(review): two functions appear interleaved in this span, apparently the
// old and new columns of a side-by-side diff fused line by line:
//   - `upsertEventMobilizon(eventObject)`: connects mongoose to DB_URL when
//     the connection state is 0 and upserts the event keyed on its title;
//   - `getEventPageInfo(url)`: opens `url` in a new browser, extracts the
//     event fields in the page, appends them to `scrappedData.pages` and
//     returns them.
// The text is not compilable as-is; confirm against the repository.
function upsertEventMobilizon(eventObject:any) { async function getEventPageInfo(url: string) {
// A separate browser is launched for every call.
const browser = await puppeteer.launch(options);
const page = await browser.newPage();
await page.goto(url);
let eventInfo = await page.evaluate(() => {
// Concatenate the innerHTML of every `.desc` element into one string.
let description = '';
document.querySelectorAll('.desc').forEach((element: any) => {
description += element.innerHTML;
})
let websiteNode: any = document.querySelector('.website');
let mapNode: any = document.querySelector('#details_map');
let filesNode: any = document.querySelector('.attachments a');
let addressParagraphs: any = document.querySelectorAll('.contact p');
// NOTE(review): the querySelector results used below are dereferenced
// without a null check; a page lacking any of these elements would presumably
// throw inside the callback — confirm every event page has them.
// @ts-ignore
let eventInfo: any = {
title: document.querySelector('.widgit_title').innerHTML,
description,
website: websiteNode.getAttribute('href'),
map_latitude: mapNode.getAttribute('data-lat')
,
map_longitude: mapNode.getAttribute('data-lng'),
files: filesNode.getAttribute('href'),
contact_all: document.querySelector('.contact').innerHTML,
// The first three `.contact p` texts are joined with no separator; a missing
// paragraph yields `undefined` in the `+` chain (the text "undefined", or NaN
// when all three are missing).
address: addressParagraphs[0]?.innerText + addressParagraphs[1]?.innerText + addressParagraphs[2]?.innerText,
phone: document.querySelector('.mbvs').innerHTML,
date: document.querySelector('.openings .pre').innerHTML,
}
console.log('eventInfo inside', eventInfo);
//
// saveScrappeddata(eventInfo);
// @ts-ignore
return eventInfo;
})
console.log('eventInfo outside', eventInfo);
// Side effect: the extracted record is appended to the module-level
// `scrappedData.pages`.
scrappedData.pages.push(eventInfo);
const DB_URL = 'mongodb://localhost/thal'; browser.close();
return eventInfo;
// readyState 0 is treated as "not connected": connect only in that case.
if (mongoose.connection.readyState == 0) { mongoose.connect(DB_URL); }
// if an event with this title exists, update the entry, don't insert
let conditions = { title: eventObject.title };
let options = { upsert: true, new: true, setDefaultsOnInsert: true };
EventMobilizon.findOneAndUpdate(conditions, eventObject, options, (err:any, result:any) => {
if (err){
console.log('result,err', result,err);
throw err;
}
});
} }
// Starts the listing scrape; the promise returned by `run()` is neither
// awaited nor given a rejection handler here.
// NOTE(review): the call appears twice on this line — presumably fused diff
// columns rather than two intended invocations; confirm.
run(); run();
/**
 * Fetches the details of one hard-coded agenda event and then logs the
 * module-level `scrappedData` accumulator.
 *
 * The value resolved by `getEventPageInfo` is deliberately not bound: that
 * function also appends the record to `scrappedData.pages`, which is what gets
 * logged. Nothing is written to disk here.
 *
 * @returns a promise that settles once the page has been scraped and logged
 */
async function getOnePage(): Promise<void> {
  await getEventPageInfo('https://www.cc-paysdelimours.fr/agenda#widget-details-offre-4718535');
  console.log('scrappedData outside', scrappedData);
}
// Single-event scrape, currently disabled.
// getOnePage()
// NOTE(review): this runs synchronously, right after `run()` has been started
// and without waiting for it. No visible code path fills the module-level
// `scrappedData` before this point, so the file presumably receives the
// initial empty object — confirm, and save only after scraping has finished.
saveScrappeddata(scrappedData);

View File

@ -0,0 +1,5 @@
{
"pages": [],
"titleList": null,
"linkTitleEvent": null
}