2022-07-11 10:50:56 +02:00
|
|
|
// Collect information about every chaton listed on chatons.org
// https://www.chatons.org/chatons/all
|
|
|
|
// Index page that scrapeDataPages fetches to discover the link of each chaton page.
const url = 'https://www.chatons.org/chatons/all';
|
|
|
|
|
|
|
|
const axios = require('axios');
|
|
|
|
const cheerio = require('cheerio');
|
|
|
|
const pretty = require('pretty');
|
|
|
|
const fs = require('fs');
|
|
|
|
|
2022-07-11 13:19:00 +02:00
|
|
|
// Upper bound used when deciding how many chaton pages to fetch in one run.
const limitPagesToFetch = 200;

// Absolute URLs of the chaton pages discovered on the index page.
const pageLinkList = [];

// One record per scraped chaton page.
const pagesChatons = [];

// Chaton names grouped by the first two characters of their zip code.
const departements = {};

// For each software name, the list of chatons ({ name, url }) offering it.
const softwaresGeneral = {};

// Aggregate that gets serialized to JSON. `departements` and
// `softwaresGeneral` are shared by reference, so entries added to them later
// show up here without any extra copy step.
const chatonsCatalog = {
  linkList: [],
  pages: [],
  departements,
  softwaresGeneral,
};
|
|
|
|
|
|
|
|
// Collect the link of each chaton page, for example
// https://www.chatons.org/chatons/bastet-parinux
//
// The information lives in the div:
// .chatons-public-column
// .chatons-public-subtitle (name)
|
|
|
|
|
|
|
|
/**
 * Scrape the index page, collect the link of every chaton page, scrape each
 * linked page, and finally persist the catalog.
 *
 * Links are appended to the module-level `pageLinkList` and exposed as
 * `chatonsCatalog.linkList`. At most `limitPagesToFetch` pages are fetched.
 *
 * @returns {Promise<void>} Resolves once the pages have been processed and the
 *   catalog write has been started. Errors are logged, not rethrown.
 */
async function scrapeDataPages() {
  try {
    // Fetch and parse the HTML of the index page.
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);

    // Only anchors carrying an `hreflang` attribute are kept as page links.
    $('.view-content a ').each((index, element) => {
      if (element.attribs && element.attribs['hreflang']) {
        pageLinkList.push('https://www.chatons.org' + element.attribs['href']);
      }
    });

    chatonsCatalog.linkList = pageLinkList;
    console.log('pageLinkList.length', pageLinkList.length);

    // slice() caps the run at exactly `limitPagesToFetch` pages.
    const pagesToFetch = pageLinkList.slice(0, limitPagesToFetch);

    // Wait for every page before saving. `false` is passed as `shouldWeSave`
    // because this function persists the catalog once itself, after all page
    // scrapes have settled, instead of relying on whichever request happened
    // to be scheduled last.
    await Promise.all(
      pagesToFetch.map((urlPage) => scrapeDataCatalogcontent(urlPage, false)),
    );

    persistCatalog();
  } catch (e) {
    console.error('e', e);
  }
}
|
|
|
|
|
|
|
|
/**
 * Write the collected catalog to `output/chatons_links.json` as
 * pretty-printed JSON.
 *
 * Attaches the module-level `pagesChatons` array to `chatonsCatalog.pages`
 * before serializing. Failures are logged to stderr; nothing is thrown.
 */
function persistCatalog() {
  console.log('saving catalog...');
  chatonsCatalog.pages = pagesChatons;

  // writeFile does not create missing directories, so make sure `output/`
  // exists first (`recursive: true` makes this a no-op when it already does).
  fs.mkdir('output', { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      console.error(mkdirErr);
      return;
    }

    fs.writeFile('output/chatons_links.json', JSON.stringify(chatonsCatalog, null, 2), (err) => {
      if (err) {
        console.error(err);
        return;
      }
      console.log('Successfully written data to file chatons_links.json');
    });
  });
}
|
|
|
|
|
2022-07-11 11:29:09 +02:00
|
|
|
/**
 * Scrape one chaton page and record what it contains.
 *
 * Side effects on module-level state:
 * - appends the chaton name to `departements[<first two zip characters>]`;
 * - appends `{ name, url }` to `softwaresGeneral[<software name>]` for every
 *   listed software whose label contains ' - ';
 * - pushes one page record onto `pagesChatons`.
 *
 * @param {string} url_page - URL of the chaton page to fetch.
 * @param {boolean} shouldWeSave - When truthy, `persistCatalog` is scheduled
 *   2 seconds after this page has been processed.
 * @returns {Promise<void>} Resolves when the page has been processed. Errors
 *   are logged, not rethrown.
 */
async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
  try {
    // Logged before the request so the message matches what is happening.
    console.log('fetching url_page', url_page);
    const { data } = await axios.get(url_page);
    const $ = cheerio.load(data);

    // Text content of a `.field--name-field-<name>` block ('' when absent).
    const fieldText = (fieldName) => $(`.field--name-field-${fieldName} .field__item`).text();
    // Own-property check, so scraped keys such as "constructor" are not
    // mistaken for existing entries inherited from Object.prototype.
    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

    const chaton_name = $('h2.chatons-public-subtitle').eq(0).text().trim();
    const websiteUrl = fieldText('website-url').trim();
    const zipCode = fieldText('zip-code');

    // Department statistics: the key is the first two characters of the
    // trimmed zip code. Pages without a zip code or without a name are skipped
    // instead of being grouped under an empty key.
    const dep = zipCode.trim().substring(0, 2);
    if (dep && chaton_name) {
      if (!hasOwn(departements, dep)) {
        console.log('ajout département ', dep);
        departements[dep] = [];
      }
      console.log('zip', dep, chaton_name);
      departements[dep].push(chaton_name);
      console.log('departements', departements);
    }

    // Software offered by this chaton.
    const softwares = [];
    $('.view-kitten-software ul li .field-content a').each((idx, el) => {
      const label = $(el).text();
      // The software name is taken as the part after the first ' - '
      // separator; it is undefined when the label has no separator.
      const soft_name = label.split(' - ')[1];

      softwares.push({
        name: label,
        software_name: soft_name,
        link: 'https://www.chatons.org' + el.attribs['href'],
      });

      if (soft_name) {
        console.log('soft_name', soft_name);
        if (!hasOwn(softwaresGeneral, soft_name)) {
          softwaresGeneral[soft_name] = [];
        }
        softwaresGeneral[soft_name].push({
          name: chaton_name,
          url: websiteUrl,
        });
      }
    });

    // Properties of the chaton page.
    pagesChatons.push({
      chatons_url: url_page,
      name: chaton_name,
      url: websiteUrl,
      rss: fieldText('rss-feed').trim(),
      organization: fieldText('structure-organization'),
      structure: fieldText('structure-type'),
      geo_area: fieldText('geo-area'),
      creation: fieldText('structure-creation'),
      since: fieldText('member-since'),
      zip_code: zipCode,
      city: fieldText('city'),
      softwares,
    });

    if (shouldWeSave) {
      setTimeout(persistCatalog, 2000);
    }
  } catch (e) {
    console.error('e', e);
  }
}
|
|
|
|
|
|
|
|
// rundown all the pages
|
|
|
|
// Script entry point. The returned promise is not awaited; scrapeDataPages
// catches and logs its own errors.
scrapeDataPages();
|