// framalibre-scraping/chatons.js

// Collect information about every chaton listed on chatons.org
// https://www.chatons.org/chatons/all
const url = 'https://www.chatons.org/chatons/all';

const axios = require('axios');
const cheerio = require('cheerio');
const pretty = require('pretty');
const fs = require('fs');

// URLs of the chaton detail pages found on the index page
// (filled by scrapeDataPages).
const pageLinkList = [];
// One record per scraped detail page (filled by scrapeDataCatalogcontent).
const pagesChatons = [];
// Object serialized to output/chatons_links.json by persistCatalog:
// `linkList` receives pageLinkList and `pages` receives pagesChatons.
const chatonsCatalog = {
  linkList: [],
  pages   : [],
};

// Collect the link of each chaton page, for example
// https://www.chatons.org/chatons/bastet-parinux
//
//
// The information lives in the div:
// .chatons-public-column
// .chatons-public-subtitle   name

/**
 * Scrapes the index page at `url`, collects the links of the detail pages,
 * scrapes up to `maxPages` of them concurrently and, once every one of those
 * scrapes has settled, writes the catalog to disk via `persistCatalog`.
 *
 * Previously the save was only requested for the very last link of the list
 * while scraping stopped after two links, so the catalog was never written
 * as soon as the index contained more than two links.
 *
 * @param {number} [maxPages=2] - Maximum number of detail pages to scrape.
 *   Defaults to 2, the cap that used to be hard-coded; pass `Infinity` to
 *   scrape every collected link.
 * @returns {Promise<void>} Resolves when scraping and the save request are
 *   done. Errors are logged to the console and not rethrown.
 */
async function scrapeDataPages(maxPages = 2) {
  try {
    // Fetch and parse the index page.
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);

    $('.view-content a ').each((index, element) => {
      const href = element.attribs?.href;
      // Only anchors carrying a `hreflang` attribute are collected
      // (presumably those are the links to chaton pages; verify against the
      // site markup). Anchors without an `href` cannot be followed.
      if (!element.attribs?.hreflang || !href) {
        return;
      }
      // Resolve against the index URL so relative and absolute hrefs both
      // yield a valid absolute link.
      pageLinkList.push(new URL(href, url).href);
    });

    chatonsCatalog.linkList = pageLinkList;
    console.log('pageLinkList.length', pageLinkList.length);

    const pagesToScrape = pageLinkList.slice(0, maxPages);
    if (pagesToScrape.length === 0) {
      // Nothing was scraped, so there is nothing worth writing.
      return;
    }

    // `shouldWeSave` is false for every page: the save happens exactly once
    // below, after all scrapes have finished, instead of relying on a timer.
    await Promise.all(
      pagesToScrape.map((urlPage) => scrapeDataCatalogcontent(urlPage, false)),
    );

    persistCatalog();
  } catch (e) {
    console.error('e', e);
  }
}

/**
 * Attaches the scraped page records to `chatonsCatalog` and writes the
 * catalog as pretty-printed JSON to `output/chatons_links.json`.
 *
 * The `output` directory is created first when it is missing, because
 * `fs.writeFile` does not create parent directories and would otherwise fail
 * with ENOENT. File-system errors are logged to the console, not thrown.
 *
 * @returns {void}
 */
function persistCatalog() {
  chatonsCatalog.pages = pagesChatons;

  // Serialize synchronously so a serialization error surfaces to the caller
  // and the written content is the catalog as it is at call time.
  const serializedCatalog = JSON.stringify(chatonsCatalog, null, 2);

  // With `recursive: true`, mkdir succeeds when the directory already exists.
  fs.mkdir('output', { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      console.error(mkdirErr);
      return;
    }

    fs.writeFile('output/chatons_links.json', serializedCatalog, (err) => {
      if (err) {
        console.error(err);
        return;
      }
      console.log('Successfully written data to file chatons_links.json');
    });
  });
}

/**
 * Fetches one detail page, extracts its fields and software list, and
 * appends the resulting record to `pagesChatons`.
 *
 * @param {string} url_page - URL of the detail page to scrape.
 * @param {boolean} shouldWeSave - When truthy, `persistCatalog` is scheduled
 *   to run 2 seconds after this page has been processed.
 * @returns {Promise<void>} Resolves when the page has been processed. Errors
 *   are logged to the console and not rethrown.
 */
async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
  try {
    console.log('fetching url_page', url_page);
    const { data } = await axios.get(url_page);
    const $ = cheerio.load(data);

    // Trimmed text of `.field--name-field-<suffix> .field__item`; an empty
    // string when the page has no such element.
    const readField = (suffix) =>
      $(`.field--name-field-${suffix} .field__item`).text().trim();

    // Software entries listed on the page. The name is the link text: the
    // previous `el.valueOf()` stored the DOM node object itself, whose
    // parent/sibling references cannot be serialized to JSON.
    const softwares = $('.view-kitten-software ul li a')
      .toArray()
      .map((el) => ({
        name: $(el).text().trim(),
        link: el.attribs.href,
      }));

    pagesChatons.push({
      name: $('h2.chatons-public-subtitle').eq(0).text().trim(),
      url: readField('website-url'),
      rss: readField('rss-feed'),
      organization: readField('structure-organization'),
      structure: readField('structure-type'),
      geo_area: readField('geo-area'),
      creation: readField('structure-creation'),
      since: readField('member-since'),
      zip_code: readField('zip-code'),
      city: readField('city'),
      softwares,
    });

    if (shouldWeSave) {
      // Delayed save kept for existing callers; presumably the delay gives
      // other in-flight scrapes time to finish, but nothing here waits for
      // them.
      setTimeout(persistCatalog, 2000);
    }
  } catch (e) {
    console.error('e', e);
  }
}

// Entry point: start scraping from the index page.
scrapeDataPages();