// Collect information about every "chaton" listed on chatons.org.
// Index page: https://www.chatons.org/chatons/all
const url = 'https://www.chatons.org/chatons/all';
const axios = require('axios');
const cheerio = require('cheerio');
const pretty = require('pretty'); // NOTE(review): not referenced by the code in this file
const fs = require('fs');

// Origin prepended to the site-relative hrefs found in the scraped pages.
const BASE_URL = 'https://www.chatons.org';
// Directory and file the catalog is written to.
const OUTPUT_DIR = 'output';
const OUTPUT_FILE = `${OUTPUT_DIR}/chatons_links.json`;

// Maximum number of chaton detail pages fetched per run.
const limitPagesToFetch = 2;

const pageLinkList = [];
const pagesChatons = [];
// These two are dictionaries of lists (zip code -> chaton names, software
// name -> chatons offering it). They must be objects: string keys set on an
// array are silently dropped by JSON.stringify. A null prototype keeps
// scraped keys from colliding with Object.prototype members.
const departments = Object.create(null);
const softwaresGeneral = Object.create(null);
const chatonsCatalog = {
  linkList: [],
  pages: [],
  departments,
  softwaresGeneral,
};

/**
 * Append `value` to the list stored under `key` in `index`, creating the
 * list on first use.
 *
 * @param {Object<string, Array>} index - dictionary of lists, mutated in place
 * @param {string} key - entry to append to
 * @param {*} value - item to append
 */
function addToIndex(index, key, value) {
  if (!index[key]) {
    index[key] = [];
  }
  index[key].push(value);
}

// Each chaton has its own detail page, for example
//   https://www.chatons.org/chatons/bastet-parinux
// The data of interest lives in the div `.chatons-public-column`;
// `.chatons-public-subtitle` holds the chaton's name.

/**
 * Fetch the index page, collect the link of every chaton detail page, scrape
 * the first `limitPagesToFetch` of them, then write the catalog to disk.
 *
 * Errors are logged to the console, never thrown to the caller.
 *
 * @returns {Promise<void>}
 */
async function scrapeDataPages() {
  try {
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);

    // Only anchors carrying an `hreflang` attribute are kept as detail-page links.
    $('.view-content a ').each((index, element) => {
      if (element.attribs && element.attribs['hreflang']) {
        pageLinkList.push(`${BASE_URL}${element.attribs['href']}`);
      }
    });
    chatonsCatalog.linkList = pageLinkList;
    console.log('pageLinkList.length', pageLinkList.length);

    // Wait for every page to be scraped before saving, instead of guessing
    // with a timer when the requests have finished. Each scrape handles its
    // own errors, so one failing page does not prevent the save.
    const pagesToFetch = pageLinkList.slice(0, limitPagesToFetch);
    await Promise.all(
      pagesToFetch.map((urlPage) => scrapeDataCatalogcontent(urlPage, false)),
    );
    persistCatalog();
  } catch (e) {
    console.error('e', e);
  }
}

/**
 * Write the catalog collected so far to `output/chatons_links.json` as
 * indented JSON. The write is asynchronous; failures are logged.
 */
function persistCatalog() {
  chatonsCatalog.pages = pagesChatons;
  try {
    // writeFile fails when the target directory is missing.
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  } catch (err) {
    console.error(err);
    return;
  }
  fs.writeFile(OUTPUT_FILE, JSON.stringify(chatonsCatalog, null, 2), (err) => {
    if (err) {
      console.error(err);
      return;
    }
    console.log('Successfully written data to file chatons_links.json');
  });
}

/**
 * Scrape one chaton detail page and record it in the shared catalog state:
 * one entry in `pagesChatons`, one entry per software in `softwaresGeneral`,
 * and the chaton's name under its zip code in `departments`.
 *
 * Errors are logged to the console, never thrown to the caller.
 *
 * @param {string} url_page - absolute URL of the chaton detail page
 * @param {boolean} [shouldWeSave=false] - when true, schedule a catalog save
 *   two seconds after this page has been processed (legacy behaviour kept for
 *   existing callers; `scrapeDataPages` saves by itself once all pages are done)
 * @returns {Promise<void>}
 */
async function scrapeDataCatalogcontent(url_page, shouldWeSave = false) {
  try {
    const { data } = await axios.get(url_page);
    console.log('fetching url_page', url_page);
    const $ = cheerio.load(data);

    // Raw text of the `.field__item` inside the field block named `field`.
    const fieldText = (field) => $(`.field--name-field-${field} .field__item`).text();

    const chatonName = $('h2.chatons-public-subtitle').eq(0).text().trim();
    const chatonUrl = fieldText('website-url').trim();
    const zipCode = fieldText('zip-code');

    // Software offered by this chaton.
    const softwareLinks = $('.view-kitten-software ul li .field-content a');
    console.log('$(\'.view-kitten-software ul li\').length', softwareLinks.length);

    const softwares = [];
    softwareLinks.each((idx, el) => {
      const label = $(el).text();
      const soft = {
        name: label,
        // Second part of the label when it is split on ' - '.
        software_name: label.split(' - ')[1],
        link: `${BASE_URL}${el.attribs['href']}`,
      };
      // Labels without a ' - ' separator have no second part; index those
      // under the full label rather than under the key "undefined".
      addToIndex(softwaresGeneral, soft.software_name ?? label, {
        name: chatonName,
        url: chatonUrl,
      });
      console.log('soft', soft);
      softwares.push(soft);
    });

    pagesChatons.push({
      name: chatonName,
      url: chatonUrl,
      rss: fieldText('rss-feed').trim(),
      organization: fieldText('structure-organization'),
      structure: fieldText('structure-type'),
      geo_area: fieldText('geo-area'),
      creation: fieldText('structure-creation'),
      since: fieldText('member-since'),
      zip_code: zipCode,
      city: fieldText('city'),
      softwares,
    });

    // Keyed by the same zip-code text stored in the page entry above.
    addToIndex(departments, zipCode, chatonName);

    if (shouldWeSave) {
      setTimeout(persistCatalog, 2000);
    }
  } catch (e) {
    console.error('e', e);
  }
}

// Run through all the pages.
scrapeDataPages();