get one page data

master
Tykayn 9 months ago committed by tykayn
parent 055a811441
commit 04f561a9d3
  1. 137
      scrapers/ccpl.ts
  2. 5
      sources_examples/ccpl_scrapped.json

@ -2,31 +2,50 @@ const puppeteer = require('puppeteer');
import * as fs from 'fs';
// configure database
const mongoose = require('mongoose');
const EventMobilizon = require('./../models/Event');
// get data from webpage
/**
 * Serializes `data` as JSON (4-space indentation) and writes it asynchronously
 * to `./sources_examples/ccpl_scrapped.json`, logging whether the write
 * succeeded or failed.
 *
 * @param data - Any JSON-serializable value to persist.
 */
function saveScrappeddata(data: any) {
  const targetName = 'ccpl_scrapped.json';
  const serialized = JSON.stringify(data, null, 4);

  // Invoked by fs.writeFile once the write has finished (or failed).
  const reportOutcome = (err: any) => {
    if (!err) {
      console.log(`File ${targetName} is written successfully!`);
      return;
    }
    console.log(`Error writing file: ${err}`);
  };

  fs.writeFile(`./sources_examples/${targetName}`, serialized, "utf8", reportOutcome);
}
// Puppeteer launch settings: non-headless browser with DevTools enabled.
let options: any = { headless: false, devtools: true };

// Module-level accumulator for scraped results; this is the object that gets
// serialized to JSON by saveScrappeddata().
let scrappedData: any = {
  pages: [],
  titleList: null,
  linkTitleEvent: null,
};
/**
 * Opens the agenda listing page and collects, for every listed event, its
 * title markup and the `href` of its title element.
 *
 * The collected titles and links are copied into the module-level
 * `scrappedData` object (`titleList`, `linkTitleEvent`) and also returned.
 * The browser is always closed, even when navigation or scraping fails.
 *
 * @returns The selector used plus the parallel `titleList` / `linkTitleEvent` arrays.
 */
async function run() {
  const launchOptions = { headless: false, devtools: true };
  const browser = await puppeteer.launch(launchOptions);

  try {
    const page = await browser.newPage();
    await page.goto('https://www.cc-paysdelimours.fr/agenda');

    // each event link is a .widgit_result
    // title: .widgit_result .widgit_title
    // title: #widgit_event_details .widgit_title

    // The callback runs inside the page, so it may only use DOM APIs and must
    // return plain serializable data (no Node modules such as `fs` in here).
    const scrapped = await page.evaluate(() => {
      const sel = '#widgit_results_agenda .widgit_result .title';
      const titleList: string[] = [];
      const linkTitleEvent: (string | null)[] = [];

      document.querySelectorAll(sel).forEach((elem: any) => {
        console.log('title', elem.innerHTML);
        titleList.push(elem.innerHTML);
        // getAttribute returns null when the element carries no href.
        linkTitleEvent.push(elem.getAttribute('href'));
      });

      console.log('titleList', titleList);
      return { selector: sel, titleList, linkTitleEvent };
    });

    // Store the results in the shared accumulator rather than in a shadowing
    // local, so that whatever is saved later actually contains them.
    scrappedData.titleList = scrapped.titleList;
    scrappedData.linkTitleEvent = scrapped.linkTitleEvent;

    // TODO: visit each collected link (with a delay between requests) to
    // fetch the per-event details.
    return scrapped;
  } finally {
    await browser.close();
    console.log('DONE');
  }
}
/**
 * Inserts the given event into MongoDB, or updates the stored event that has
 * the same `title`.
 *
 * Connects to the local database first when mongoose reports a disconnected
 * state (`readyState` 0).
 *
 * @param eventObject - Event fields to store; `title` is used as the match key.
 */
function upsertEventMobilizon(eventObject: any) {
  const DB_URL = 'mongodb://localhost/thal';

  if (mongoose.connection.readyState === 0) {
    mongoose.connect(DB_URL);
  }

  // if an event with this title exists, update the entry, don't insert
  const conditions = { title: eventObject.title };
  const upsertOptions = { upsert: true, new: true, setDefaultsOnInsert: true };

  EventMobilizon.findOneAndUpdate(conditions, eventObject, upsertOptions, (err: any, result: any) => {
    if (err) {
      console.log('result,err', result, err);
      // NOTE(review): thrown from an async callback, so the caller of
      // upsertEventMobilizon cannot catch this.
      throw err;
    }
  });
}
/**
 * Loads a single event page and extracts its details from the DOM.
 *
 * The extracted record is appended to the module-level `scrappedData.pages`
 * and returned. Fields whose element is absent from the page are `null`
 * (or an empty string for `description` / `address`) instead of raising a
 * TypeError. The browser is always closed, even on failure.
 *
 * @param url - Address of the event details page to open.
 * @returns The extracted event record.
 */
async function getEventPageInfo(url: string) {
  const browser = await puppeteer.launch(options);

  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Runs inside the page: only DOM APIs are available and the return value
    // must be plain serializable data.
    const eventInfo = await page.evaluate(() => {
      // innerHTML of the first match, or null when nothing matches.
      const html = (selector: string): string | null =>
        document.querySelector(selector)?.innerHTML ?? null;
      // Attribute of the first match, or null when the element or attribute is missing.
      const attr = (selector: string, name: string): string | null =>
        document.querySelector(selector)?.getAttribute(name) ?? null;

      let description = '';
      document.querySelectorAll('.desc').forEach((element: any) => {
        description += element.innerHTML;
      });

      // The address is spread over the first three paragraphs of the contact
      // block; missing paragraphs contribute nothing.
      const addressParagraphs: any = document.querySelectorAll('.contact p');
      const address = [0, 1, 2]
        .map((index) => addressParagraphs[index]?.innerText ?? '')
        .join('');

      const info: any = {
        title: html('.widgit_title'),
        description,
        website: attr('.website', 'href'),
        map_latitude: attr('#details_map', 'data-lat'),
        map_longitude: attr('#details_map', 'data-lng'),
        files: attr('.attachments a', 'href'),
        contact_all: html('.contact'),
        address,
        phone: html('.mbvs'),
        date: html('.openings .pre'),
      };
      console.log('eventInfo inside', info);
      return info;
    });

    console.log('eventInfo outside', eventInfo);
    scrappedData.pages.push(eventInfo);
    return eventInfo;
  } finally {
    await browser.close();
  }
}
/**
 * Scrapes one hard-coded event details page via getEventPageInfo() and logs
 * the accumulated `scrappedData`. It does not write anything to disk itself.
 */
async function getOnePage() {
  await getEventPageInfo('https://www.cc-paysdelimours.fr/agenda#widget-details-offre-4718535');
  console.log('scrappedData outside', scrappedData);
}

// Entry point: scrape first, then write the JSON file. Saving is chained after
// the scrape so the file is not written while `scrappedData` is still in its
// initial state; a failed scrape is reported and whatever was collected is
// still saved.
run()
  .catch((err: unknown) => {
    console.error('Scraping failed:', err);
  })
  .then(() => saveScrappeddata(scrappedData));
// getOnePage()

@ -0,0 +1,5 @@
{
"pages": [],
"titleList": null,
"linkTitleEvent": null
}
Loading…
Cancel
Save