framalibre-scraping/chatons.js

151 lines
4.7 KiB
JavaScript
Raw Normal View History

2022-07-11 10:50:56 +02:00
// Collect information about every chaton listed on chatons.org.
// Index page: https://www.chatons.org/chatons/all
const url = 'https://www.chatons.org/chatons/all';
// HTTP client used to download the index page and each chaton page.
const axios = require('axios');
// HTML parser used to query the downloaded pages with CSS selectors.
const cheerio = require('cheerio');
// NOTE(review): `pretty` is not referenced in the visible code — confirm before removing.
const pretty = require('pretty');
// Used to write the resulting catalog to disk.
const fs = require('fs');
2022-07-11 12:57:18 +02:00
// Maximum number of chaton detail pages fetched in one run.
const limitPagesToFetch = 2;
// Absolute URL of every chaton detail page found on the index page.
const pageLinkList = [];
// One record per scraped chaton detail page.
const pagesChatons = [];
// Chaton names grouped by zip code. This is a dictionary keyed by string, so it
// must be an object: JSON.stringify ignores non-index properties of arrays and
// would serialise an array used this way as `[]`. A null prototype keeps
// scraped keys from colliding with inherited Object properties.
const departments = Object.create(null);
// Chatons (name + url) grouped by the software they host; same reasoning as above.
const softwaresGeneral = Object.create(null);
// Aggregate written to disk at the end of the run.
const chatonsCatalog = {
  linkList: [],
  pages: [],
  departments,
  softwaresGeneral,
};
// Collect the link to each chaton's page, for example:
// https://www.chatons.org/chatons/bastet-parinux
//
// The details are found in these elements:
// .chatons-public-column
// .chatons-public-subtitle (name)
// Async function which scrapes the data
/**
 * Scrape the chatons.org index page, collect the link to every chaton detail
 * page, scrape the first `limitPagesToFetch` of them and write the catalog.
 *
 * Links are appended to `pageLinkList` and exposed as `chatonsCatalog.linkList`.
 * Errors are logged, not rethrown, so the returned promise always resolves.
 *
 * @returns {Promise<void>}
 */
async function scrapeDataPages() {
  try {
    // Fetch and parse the index page.
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);

    // Only anchors carrying an `hreflang` attribute are treated as chaton links.
    $('.view-content a ').each((index, element) => {
      if (element.attribs && element.attribs['hreflang']) {
        pageLinkList.push(`https://www.chatons.org${element.attribs['href']}`);
      }
    });
    chatonsCatalog.linkList = pageLinkList;
    console.log('pageLinkList.length', pageLinkList.length);

    // Wait for every detail page before saving. The previous per-call
    // "shouldWeSave" flag compared the index with `limitPagesToFetch` inside a
    // `index < limitPagesToFetch` guard, so it was never set when the limit
    // truncated the list, and the catalog was never written.
    const pagesToFetch = pageLinkList.slice(0, limitPagesToFetch);
    await Promise.all(
      pagesToFetch.map((urlPage) => scrapeDataCatalogcontent(urlPage, false)),
    );
    persistCatalog();
  } catch (e) {
    console.error('e', e);
  }
}
/**
 * Attach the scraped pages to `chatonsCatalog` and write it, as indented JSON,
 * to `output/chatons_links.json`.
 *
 * The `output` directory is created when missing so that a fresh checkout does
 * not fail with ENOENT. Failures are logged to the console; nothing is thrown.
 */
function persistCatalog() {
  chatonsCatalog.pages = pagesChatons;
  // Serialise immediately so the file reflects the catalog at call time.
  const payload = JSON.stringify(chatonsCatalog, null, 2);

  fs.mkdir('output', { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      console.error(mkdirErr);
      return;
    }
    fs.writeFile('output/chatons_links.json', payload, (err) => {
      if (err) {
        console.error(err);
        return;
      }
      console.log('Successfully written data to file chatons_links.json');
    });
  });
}
2022-07-11 11:29:09 +02:00
/**
 * Return the concatenated text of the elements matching
 * `.field--name-field-<fieldName> .field__item` in the loaded page.
 */
function readFieldText($, fieldName) {
  return $(`.field--name-field-${fieldName} .field__item`).text();
}

/**
 * Append `value` to the list stored under `key` in `index`, creating the list
 * on first use.
 */
function appendToIndex(index, key, value) {
  if (!Array.isArray(index[key])) {
    index[key] = [];
  }
  index[key].push(value);
}

/**
 * Scrape one chaton detail page and record it in the shared collections:
 * a record is pushed to `pagesChatons`, the chaton is indexed per software in
 * `softwaresGeneral` and per zip code in `departments`.
 *
 * Errors are logged, not rethrown, so the returned promise always resolves.
 *
 * @param {string} url_page - Absolute URL of the chaton detail page.
 * @param {boolean} shouldWeSave - When truthy, schedule `persistCatalog` two
 *   seconds after this page has been processed.
 * @returns {Promise<void>}
 */
async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
  try {
    // Fetch and parse the detail page.
    const { data } = await axios.get(url_page);
    console.log('fetching url_page', url_page);
    const $ = cheerio.load(data);

    // Values reused several times below; read them once.
    const chatonName = $('h2.chatons-public-subtitle').eq(0).text().trim();
    const chatonUrl = readFieldText($, 'website-url').trim();
    const zipCode = readFieldText($, 'zip-code');

    // Software hosted by this chaton.
    const softwareLinks = $('.view-kitten-software ul li .field-content a');
    console.log('$(\'.view-kitten-software ul li\').length', softwareLinks.length);
    const softwares = [];
    softwareLinks.each((idx, el) => {
      const label = $(el).text();
      // The software name is taken from after the first ' - ' in the label
      // (labels presumably look like "<service> - <software>" — TODO confirm).
      // Labels without a separator fall back to the full label so they are not
      // filed under the key "undefined".
      const softwareName = label.split(' - ')[1] ?? label;
      const soft = {
        name: label,
        software_name: softwareName,
        link: `https://www.chatons.org${el.attribs['href']}`,
      };
      // The original guard was inverted: it pushed onto the entry only when the
      // entry did not exist, which threw a TypeError.
      appendToIndex(softwaresGeneral, softwareName, {
        name: chatonName,
        url: chatonUrl,
      });
      console.log('soft', soft);
      softwares.push(soft);
    });

    pagesChatons.push({
      name: chatonName,
      url: chatonUrl,
      rss: readFieldText($, 'rss-feed').trim(),
      organization: readFieldText($, 'structure-organization'),
      structure: readFieldText($, 'structure-type'),
      geo_area: readFieldText($, 'geo-area'),
      creation: readFieldText($, 'structure-creation'),
      since: readFieldText($, 'member-since'),
      zip_code: zipCode,
      city: readFieldText($, 'city'),
      softwares,
    });

    // Index the chaton under its zip code. The original referenced a misspelt
    // `departements` identifier, which threw a ReferenceError before saving.
    appendToIndex(departments, zipCode, chatonName);

    if (shouldWeSave) {
      setTimeout(persistCatalog, 2000);
    }
  } catch (e) {
    console.error('e', e);
  }
}
// Entry point: start the scrape (index page first, then the chaton pages)
// as soon as the script is loaded.
scrapeDataPages();