// Scraper for the chatons.org public directory.
// Scrape information about every CHATONS member listed on chatons.org.
// Index page: https://www.chatons.org/chatons/all

const url = 'https://www.chatons.org/chatons/all';

const axios = require('axios');
const cheerio = require('cheerio');
const pretty = require('pretty'); // NOTE(review): required but unused in this file — confirm before removing
const fs = require('fs');

// Detail-page URLs collected from the index page (one per CHATONS member).
const pageLinkList = [];

// Aggregated result, persisted to disk at the end of the run.
const chatonsCatalog = {
  linkList: [],
  pages: [],
};
|
||
|
|
||
|
// Collect the link of each member, e.g.
//   https://www.chatons.org/chatons/bastet-parinux
// Member info lives in the div:
//   .chatons-public-column
//   .chatons-public-subtitle  -> name

/**
 * Fetch the index page, collect every member detail-page URL into
 * `pageLinkList` / `chatonsCatalog.linkList`, scrape the first detail
 * page, then persist the catalog to disk.
 *
 * NOTE(review): like the original, only the FIRST detail page is
 * scraped (presumably a development limit) — widen the loop to scrape
 * every entry of `pageLinkList` when ready.
 */
async function scrapeDataPages() {
  try {
    // Fetch the HTML of the index page and load it into cheerio.
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);

    // All anchors in the results list; member links carry `hreflang`.
    const listItems = $('.view-content a ');

    listItems.each((index, element) => {
      if (element.attribs && element.attribs['hreflang']) {
        const pageUrl = 'https://www.chatons.org' + element.attribs['href'];
        console.log('url', pageUrl);
        pageLinkList.push(pageUrl);
      }
    });

    chatonsCatalog.linkList = pageLinkList;
    console.log('pageLinkList.length', pageLinkList.length);

    // BUGFIX: the original iterated with forEach((indexPage, urlPage) => …),
    // but forEach passes (value, index) — the parameters were swapped, so
    // `indexPage` held a URL string and both `=== 0` and `=== length - 1`
    // comparisons were always false: nothing was ever scraped or persisted.
    // Also await the (async) scrape so the catalog is complete before it is
    // written to disk.
    if (pageLinkList.length > 0) {
      await scrapeDataCatalogcontent(pageLinkList[0]);
      persistCatalog();
    }
  } catch (e) {
    console.error('e', e);
  }
}
|
||
|
|
||
|
/**
 * Write the collected catalog to output/chatons_links.json.
 * Errors are logged, not thrown (fire-and-forget persistence).
 */
function persistCatalog() {
  // BUGFIX: ensure the target directory exists first — on a fresh checkout
  // fs.writeFile would otherwise fail with ENOENT.
  fs.mkdirSync('output', { recursive: true });

  fs.writeFile('output/chatons_links.json', JSON.stringify(chatonsCatalog, null, 2), (err) => {
    if (err) {
      console.error(err);
      return;
    }
    console.log('Successfully written data to file chatons_links.json');
  });
}
|
||
|
|
||
|
/**
 * Fetch one member detail page and append its name, website URL and RSS
 * feed to `chatonsCatalog.pages`.
 *
 * @param {string} url_page - absolute URL of the member detail page
 */
async function scrapeDataCatalogcontent(url_page) {
  try {
    // Fetch the HTML of the detail page we want to scrape.
    const { data } = await axios.get(url_page);
    console.log('fetching url_page', url_page);
    const $ = cheerio.load(data);

    // BUGFIX: cheerio selections have no `innerText` property (that is a
    // browser DOM API), so every field was stored as `undefined`. Use the
    // cheerio `.text()` method instead; it returns '' for empty selections.
    chatonsCatalog.pages.push({
      name: $('.chatons-public-subtitle').eq(0).text().trim(),
      url: $('.field--name-field-website-url').eq(0).text().trim(),
      rss: $('.field--name-field-rss-feed').eq(0).text().trim(),
    });
  } catch (e) {
    console.error('e', e);
  }
}
|
||
|
|
||
|
// Entry point: crawl the index page and build the catalog.
// scrapeDataPages handles its own errors, so the promise is safely floated.
void scrapeDataPages();
|