Compare commits
4 Commits
b02786046d
...
15c83d8636
Author | SHA1 | Date |
---|---|---|
Tykayn | 15c83d8636 | |
Tykayn | ccaf76f8be | |
Tykayn | f14cf2c4f2 | |
Tykayn | d6a41df40c | |
72
chatons.js
72
chatons.js
|
@ -7,11 +7,17 @@ const cheerio = require('cheerio');
|
|||
const pretty = require('pretty');
|
||||
const fs = require('fs');
|
||||
|
||||
// Index threshold used by scrapeDataPages: a link is only scraped while its
// zero-based position in pageLinkList is <= this value, and reaching this
// index is one of the two conditions that request a save of the catalog.
const limitPagesToFetch = 200;
|
||||
|
||||
// Page URLs iterated by scrapeDataPages; each one is handed to
// scrapeDataCatalogcontent. NOTE(review): where this list gets filled is not
// visible in this hunk.
const pageLinkList = [];
|
||||
// One record per scraped page (name, url, rss, organization, softwares, ...),
// appended by scrapeDataCatalogcontent and copied into chatonsCatalog.pages
// by persistCatalog.
const pagesChatons = [];
|
||||
// Per-department stats: key is the first two characters of the page's
// zip-code field text, value is an array of chaton names.
const departements = {};
|
||||
// Per-software index: key is the software name (the part after ' - ' in the
// software link text), value is an array of { name, url } entries describing
// the chatons that list it.
const softwaresGeneral = {};
|
||||
// Aggregate object that persistCatalog serializes as JSON to
// 'output/chatons_links.json'.
const chatonsCatalog = {
|
||||
// Starts empty. NOTE(review): no writer for this field is visible here.
linkList: [],
|
||||
// Placeholder; replaced with pagesChatons right before saving.
pages : [],
|
||||
// Same object references as the module-level maps above, so entries added
// to them while scraping are visible through the catalog when it is saved.
departements,
|
||||
softwaresGeneral,
|
||||
};
|
||||
|
||||
// récupérer les liens de chaque chaton, exemple
|
||||
|
@ -47,15 +53,15 @@ async function scrapeDataPages() {
|
|||
|
||||
console.log('pageLinkList.length', pageLinkList.length);
|
||||
let indexPage = 0;
|
||||
|
||||
pageLinkList.forEach((urlPage) => {
|
||||
if (indexPage < 100) {
|
||||
let shouldWeSave = indexPage === pageLinkList.length - 1
|
||||
if (indexPage <= limitPagesToFetch) {
|
||||
let shouldWeSave = (indexPage === limitPagesToFetch || indexPage === pageLinkList.length - 1);
|
||||
|
||||
scrapeDataCatalogcontent(urlPage, shouldWeSave);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// console.log('indexPage', indexPage, pageLinkList.length)
|
||||
indexPage++;
|
||||
});
|
||||
|
@ -67,6 +73,7 @@ async function scrapeDataPages() {
|
|||
|
||||
function persistCatalog() {
|
||||
|
||||
console.log('saving catalog...');
|
||||
chatonsCatalog.pages = pagesChatons;
|
||||
// console.log('pagesChatons', pagesChatons)
|
||||
fs.writeFile('output/chatons_links.json', JSON.stringify(chatonsCatalog, null, 2), (err) => {
|
||||
|
@ -87,21 +94,53 @@ async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
|
|||
// Load HTML we fetched in the previous line
|
||||
const $ = cheerio.load(data);
|
||||
|
||||
|
||||
// logiciels du chatons
|
||||
|
||||
let softwares =[]
|
||||
$('.view-kitten-software ul li a').each((idx, el) => {
|
||||
// console.log('idx', idx)
|
||||
// console.log('el', el)
|
||||
softwares.push({
|
||||
name: el.textContent,
|
||||
link: el.attribs['href']
|
||||
})
|
||||
})
|
||||
// console.log('org', $('.field--name-field-structure-organization')?.text());
|
||||
let softwares = [];
|
||||
|
||||
let chaton_name = $('h2.chatons-public-subtitle').eq(0).text().trim();
|
||||
|
||||
|
||||
// stats départements
|
||||
let dep = $('.field--name-field-zip-code .field__item')?.text().substring(0, 2);
|
||||
|
||||
if (dep != null && chaton_name && chaton_name.length) {
|
||||
if (!departements[dep]) {
|
||||
console.log('ajout département ',dep)
|
||||
departements[dep] = [];
|
||||
}
|
||||
console.log('zip', dep, chaton_name);
|
||||
departements[dep].push(chaton_name);
|
||||
console.log('departements', departements)
|
||||
}
|
||||
|
||||
// propriétés de la page chaton
|
||||
$('.view-kitten-software ul li .field-content a').each((idx, el) => {
|
||||
|
||||
let soft_name = el ? $(el).text().split(' - ')[1] : '';
|
||||
let soft = {
|
||||
name : el ? $(el).text() : '',
|
||||
software_name: soft_name,
|
||||
link : el ? 'https://www.chatons.org' + el?.attribs['href'] : '',
|
||||
};
|
||||
|
||||
softwares.push(soft);
|
||||
if (soft_name) {
|
||||
|
||||
console.log('soft_name', soft_name)
|
||||
if (!softwaresGeneral[soft_name]) {
|
||||
softwaresGeneral[soft_name] = [];
|
||||
|
||||
}
|
||||
softwaresGeneral[soft_name].push({
|
||||
name: chaton_name,
|
||||
url : $('.field--name-field-website-url .field__item')?.text().trim(),
|
||||
});
|
||||
}
|
||||
});
|
||||
pagesChatons.push({
|
||||
name: $('h2.chatons-public-subtitle').eq(0).text().trim(),
|
||||
chatons_url : url_page,
|
||||
name : chaton_name,
|
||||
url : $('.field--name-field-website-url .field__item')?.text().trim(),
|
||||
rss : $('.field--name-field-rss-feed .field__item')?.text().trim(),
|
||||
organization: $('.field--name-field-structure-organization .field__item')?.text(),
|
||||
|
@ -114,9 +153,10 @@ async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
|
|||
softwares,
|
||||
});
|
||||
|
||||
|
||||
if (shouldWeSave) {
|
||||
|
||||
setTimeout(persistCatalog,2000)
|
||||
setTimeout(persistCatalog, 2000);
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('e', e);
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue