Compare commits

..

No commits in common. "15c83d8636bbd501fa2b4d661a391cbc352fea7a" and "b02786046dd8666a0bb99038ce56bbc1d5287432" have entirely different histories.

3 changed files with 3365 additions and 3777 deletions

View File

@ -7,17 +7,11 @@ const cheerio = require('cheerio');
const pretty = require('pretty');
const fs = require('fs');
const limitPagesToFetch = 200;
const pageLinkList = [];
const pagesChatons = [];
const departements = {};
const softwaresGeneral = {};
const chatonsCatalog = {
linkList: [],
pages : [],
departements,
softwaresGeneral,
};
// récupérer les liens de chaque chaton, exemple
@ -53,15 +47,15 @@ async function scrapeDataPages() {
console.log('pageLinkList.length', pageLinkList.length);
let indexPage = 0;
pageLinkList.forEach((urlPage) => {
if (indexPage <= limitPagesToFetch) {
let shouldWeSave = (indexPage === limitPagesToFetch || indexPage === pageLinkList.length - 1);
if (indexPage < 100) {
let shouldWeSave = indexPage === pageLinkList.length - 1
scrapeDataCatalogcontent(urlPage, shouldWeSave);
}
// console.log('indexPage', indexPage, pageLinkList.length)
indexPage++;
});
@ -73,7 +67,6 @@ async function scrapeDataPages() {
function persistCatalog() {
console.log('saving catalog...');
chatonsCatalog.pages = pagesChatons;
// console.log('pagesChatons', pagesChatons)
fs.writeFile('output/chatons_links.json', JSON.stringify(chatonsCatalog, null, 2), (err) => {
@ -94,69 +87,36 @@ async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
// Load HTML we fetched in the previous line
const $ = cheerio.load(data);
// logiciels du chatons
let softwares = [];
let chaton_name = $('h2.chatons-public-subtitle').eq(0).text().trim();
// stats départements
let dep = $('.field--name-field-zip-code .field__item')?.text().substring(0, 2);
if (dep != null && chaton_name && chaton_name.length) {
if (!departements[dep]) {
console.log('ajout département ',dep)
departements[dep] = [];
}
console.log('zip', dep, chaton_name);
departements[dep].push(chaton_name);
console.log('departements', departements)
}
// propriétés de la page chaton
$('.view-kitten-software ul li .field-content a').each((idx, el) => {
let soft_name = el ? $(el).text().split(' - ')[1] : '';
let soft = {
name : el ? $(el).text() : '',
software_name: soft_name,
link : el ? 'https://www.chatons.org' + el?.attribs['href'] : '',
};
softwares.push(soft);
if (soft_name) {
console.log('soft_name', soft_name)
if (!softwaresGeneral[soft_name]) {
softwaresGeneral[soft_name] = [];
}
softwaresGeneral[soft_name].push({
name: chaton_name,
url : $('.field--name-field-website-url .field__item')?.text().trim(),
});
}
});
let softwares =[]
$('.view-kitten-software ul li a').each((idx, el) => {
// console.log('idx', idx)
// console.log('el', el)
softwares.push({
name: el.textContent,
link: el.attribs['href']
})
})
// console.log('org', $('.field--name-field-structure-organization')?.text());
pagesChatons.push({
chatons_url : url_page,
name : chaton_name,
url : $('.field--name-field-website-url .field__item')?.text().trim(),
rss : $('.field--name-field-rss-feed .field__item')?.text().trim(),
organization: $('.field--name-field-structure-organization .field__item')?.text(),
structure : $('.field--name-field-structure-type .field__item')?.text(),
geo_area : $('.field--name-field-geo-area .field__item')?.text(),
creation : $('.field--name-field-structure-creation .field__item')?.text(),
since : $('.field--name-field-member-since .field__item')?.text(),
zip_code : $('.field--name-field-zip-code .field__item')?.text(),
city : $('.field--name-field-city .field__item')?.text(),
name: $('h2.chatons-public-subtitle').eq(0).text().trim(),
url : $('.field--name-field-website-url .field__item')?.text().trim(),
rss : $('.field--name-field-rss-feed .field__item')?.text().trim(),
organization : $('.field--name-field-structure-organization .field__item')?.text(),
structure : $('.field--name-field-structure-type .field__item')?.text(),
geo_area : $('.field--name-field-geo-area .field__item')?.text(),
creation : $('.field--name-field-structure-creation .field__item')?.text(),
since : $('.field--name-field-member-since .field__item')?.text(),
zip_code : $('.field--name-field-zip-code .field__item')?.text(),
city : $('.field--name-field-city .field__item')?.text(),
softwares,
});
if(shouldWeSave){
if (shouldWeSave) {
setTimeout(persistCatalog, 2000);
setTimeout(persistCatalog,2000)
}
} catch (e) {
console.error('e', e);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff