// Collect information about every "chaton" listed on chatons.org.
// Index page: https://www.chatons.org/chatons/all
const url = 'https://www.chatons.org/chatons/all';
const axios = require('axios');
const cheerio = require('cheerio');
const pretty = require('pretty');
const fs = require('fs');

// Origin used to resolve the relative links found on the index page.
const SITE_ORIGIN = 'https://www.chatons.org';
// Upper bound on the number of detail pages fetched in one run.
const MAX_PAGES = 100;
const OUTPUT_DIR = 'output';
const OUTPUT_FILE = 'output/chatons_links.json';

const pageLinkList = [];
const pagesChatons = [];
const chatonsCatalog = {
  linkList: [],
  pages: [],
};

// Each chaton has its own detail page, for example
// https://www.chatons.org/chatons/bastet-parinux
//
// On a detail page the information lives in:
//   .chatons-public-column
//   .chatons-public-subtitle (name)

/**
 * Scrapes the index page, collects the link of every chaton detail page,
 * scrapes at most MAX_PAGES of them and then writes the catalog to disk.
 *
 * Errors are logged, not rethrown.
 *
 * @returns {Promise<void>} Resolves once every page scrape has settled and
 *   the write of the catalog has been started.
 */
async function scrapeDataPages() {
  try {
    // Fetch and parse the HTML of the index page.
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);

    $('.view-content a ').each((index, element) => {
      const { href, hreflang } = element.attribs ?? {};
      // Only anchors carrying an hreflang attribute are collected; anchors
      // without an href are skipped so no "...undefined" URL is produced.
      if (hreflang && href) {
        pageLinkList.push(new URL(href, SITE_ORIGIN).href);
      }
    });

    chatonsCatalog.linkList = pageLinkList;
    console.log('pageLinkList.length', pageLinkList.length);

    // Wait for every detail page before saving, instead of relying on a
    // timer attached to the last link (which was never reached when more
    // than MAX_PAGES links were found).
    const pagesToFetch = pageLinkList.slice(0, MAX_PAGES);
    await Promise.all(
      pagesToFetch.map((urlPage) => scrapeDataCatalogcontent(urlPage, false)),
    );

    persistCatalog();
  } catch (e) {
    console.error('e', e);
  }
}

/**
 * Writes the catalog (link list + scraped pages) as pretty-printed JSON to
 * output/chatons_links.json, creating the output directory when missing.
 *
 * Errors are logged to the console; nothing is thrown or returned.
 */
function persistCatalog() {
  chatonsCatalog.pages = pagesChatons;

  fs.mkdir(OUTPUT_DIR, { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      console.error(mkdirErr);
      return;
    }

    fs.writeFile(OUTPUT_FILE, JSON.stringify(chatonsCatalog, null, 2), (err) => {
      if (err) {
        console.error(err);
        return;
      }
      console.log('Successfully written data to file chatons_links.json');
    });
  });
}

/**
 * Scrapes one chaton detail page and appends its record to pagesChatons.
 *
 * Errors are logged, not rethrown, so one failing page does not abort the
 * others.
 *
 * @param {string} url_page - Absolute URL of the detail page.
 * @param {boolean} [shouldWeSave=false] - When true, schedules
 *   persistCatalog() two seconds after this page has been scraped. Kept for
 *   callers that still use it; scrapeDataPages() saves by itself once all
 *   pages have settled.
 * @returns {Promise<void>}
 */
async function scrapeDataCatalogcontent(url_page, shouldWeSave = false) {
  try {
    // Fetch and parse the HTML of the detail page.
    const { data } = await axios.get(url_page);
    console.log('fetching url_page', url_page);
    const $ = cheerio.load(data);

    // Text of the first-level field item for a given Drupal field suffix.
    const fieldText = (field) => $(`.field--name-field-${field} .field__item`).text();

    // Software offered by this chaton.
    const softwares = [];
    $('.view-kitten-software ul li a').each((idx, el) => {
      softwares.push({
        // Parsed nodes expose no `textContent`; read the text through cheerio.
        name: $(el).text(),
        link: el.attribs['href'],
      });
    });

    pagesChatons.push({
      name: $('h2.chatons-public-subtitle').eq(0).text().trim(),
      url: fieldText('website-url').trim(),
      rss: fieldText('rss-feed').trim(),
      organization: fieldText('structure-organization'),
      structure: fieldText('structure-type'),
      geo_area: fieldText('geo-area'),
      creation: fieldText('structure-creation'),
      since: fieldText('member-since'),
      zip_code: fieldText('zip-code'),
      city: fieldText('city'),
      softwares,
    });

    if (shouldWeSave) {
      setTimeout(persistCatalog, 2000);
    }
  } catch (e) {
    console.error('e', e);
  }
}

// Run through all the pages.
scrapeDataPages();