stats departements

This commit is contained in:
Tykayn 2022-07-11 13:19:00 +02:00 committed by caligulanorris
parent ccaf76f8be
commit 15c83d8636
2 changed files with 248 additions and 3358 deletions

View File

@ -7,18 +7,17 @@ const cheerio = require('cheerio');
const pretty = require('pretty'); const pretty = require('pretty');
const fs = require('fs'); const fs = require('fs');
const limitPagesToFetch = 200;
const limitPagesToFetch = 2
const pageLinkList = []; const pageLinkList = [];
const pagesChatons = []; const pagesChatons = [];
const departments = []; const departements = {};
const softwaresGeneral = []; const softwaresGeneral = {};
const chatonsCatalog = { const chatonsCatalog = {
linkList: [], linkList: [],
pages : [], pages : [],
departments, departements,
softwaresGeneral softwaresGeneral,
}; };
// récupérer les liens de chaque chaton, exemple // récupérer les liens de chaque chaton, exemple
@ -56,7 +55,7 @@ async function scrapeDataPages() {
let indexPage = 0; let indexPage = 0;
pageLinkList.forEach((urlPage) => { pageLinkList.forEach((urlPage) => {
if (indexPage < limitPagesToFetch) { if (indexPage <= limitPagesToFetch) {
let shouldWeSave = (indexPage === limitPagesToFetch || indexPage === pageLinkList.length - 1); let shouldWeSave = (indexPage === limitPagesToFetch || indexPage === pageLinkList.length - 1);
scrapeDataCatalogcontent(urlPage, shouldWeSave); scrapeDataCatalogcontent(urlPage, shouldWeSave);
@ -74,6 +73,7 @@ async function scrapeDataPages() {
function persistCatalog() { function persistCatalog() {
console.log('saving catalog...');
chatonsCatalog.pages = pagesChatons; chatonsCatalog.pages = pagesChatons;
// console.log('pagesChatons', pagesChatons) // console.log('pagesChatons', pagesChatons)
fs.writeFile('output/chatons_links.json', JSON.stringify(chatonsCatalog, null, 2), (err) => { fs.writeFile('output/chatons_links.json', JSON.stringify(chatonsCatalog, null, 2), (err) => {
@ -97,29 +97,50 @@ async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
// logiciels du chatons // logiciels du chatons
let softwares = []; let softwares = [];
console.log('$(\'.view-kitten-software ul li\').length', $('.view-kitten-software ul li .field-content a').length);
let chaton_name = $('h2.chatons-public-subtitle').eq(0).text().trim();
// stats départements
let dep = $('.field--name-field-zip-code .field__item')?.text().substring(0, 2);
if (dep != null && chaton_name && chaton_name.length) {
if (!departements[dep]) {
console.log('ajout département ',dep)
departements[dep] = [];
}
console.log('zip', dep, chaton_name);
departements[dep].push(chaton_name);
console.log('departements', departements)
}
// propriétés de la page chaton
$('.view-kitten-software ul li .field-content a').each((idx, el) => { $('.view-kitten-software ul li .field-content a').each((idx, el) => {
// console.log('idx', idx)
// console.log('el', el) let soft_name = el ? $(el).text().split(' - ')[1] : '';
let soft = { let soft = {
name: el ? $(el).text() : '', name : el ? $(el).text() : '',
software_name: el ? $(el).text().split(' - ')[1] : '', software_name: soft_name,
link: el ? 'https://www.chatons.org' + el?.attribs['href']:"", link : el ? 'https://www.chatons.org' + el?.attribs['href'] : '',
}; };
if(!softwaresGeneral[$(el).text().split(' - ')[1]]){
softwaresGeneral[$(el).text().split(' - ')[1]].push({
name : $('h2.chatons-public-subtitle').eq(0).text().trim(),
url : $('.field--name-field-website-url .field__item')?.text().trim(),
})
}
console.log('soft', soft);
softwares.push(soft); softwares.push(soft);
if (soft_name) {
console.log('soft_name', soft_name)
if (!softwaresGeneral[soft_name]) {
softwaresGeneral[soft_name] = [];
}
softwaresGeneral[soft_name].push({
name: chaton_name,
url : $('.field--name-field-website-url .field__item')?.text().trim(),
});
}
}); });
// console.log('org', $('.field--name-field-structure-organization')?.text());
pagesChatons.push({ pagesChatons.push({
name : $('h2.chatons-public-subtitle').eq(0).text().trim(), chatons_url : url_page,
name : chaton_name,
url : $('.field--name-field-website-url .field__item')?.text().trim(), url : $('.field--name-field-website-url .field__item')?.text().trim(),
rss : $('.field--name-field-rss-feed .field__item')?.text().trim(), rss : $('.field--name-field-rss-feed .field__item')?.text().trim(),
organization: $('.field--name-field-structure-organization .field__item')?.text(), organization: $('.field--name-field-structure-organization .field__item')?.text(),
@ -132,10 +153,6 @@ async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
softwares, softwares,
}); });
if(!departements[$('.field--name-field-zip-code .field__item')?.text()]){
departements[$('.field--name-field-zip-code .field__item')?.text()] = []
}
departements[$('.field--name-field-zip-code .field__item')?.text()].push($('h2.chatons-public-subtitle').eq(0).text().trim())
if (shouldWeSave) { if (shouldWeSave) {

File diff suppressed because it is too large Load Diff