This commit is contained in:
Tykayn 2022-07-11 11:29:09 +02:00 committed by caligulanorris
parent 198e825e5f
commit b02786046d
3 changed files with 3549 additions and 112 deletions

View File

@ -1,4 +1,9 @@
# Data scraping de framalibre
# Data scraping
## du catalogue de chatons.org
node chatons.js
et hop, on obtient un export des caractéristiques des chatons
## de framalibre
configurer main.js pour définir l'ID maximal
# Utilisation
Installer les paquets npm

View File

@ -8,6 +8,7 @@ const pretty = require('pretty');
const fs = require('fs');
const pageLinkList = [];
const pagesChatons = [];
const chatonsCatalog = {
linkList: [],
pages : [],
@ -36,28 +37,27 @@ async function scrapeDataPages() {
if (element.attribs && element.attribs['hreflang']) {
let url = 'https://www.chatons.org' + element.attribs['href'];
console.log('url', url);
console.log('element.getAttribute(\'href\')', url);
// console.log('url', url);
// console.log('element.getAttribute(\'href\')', url);
pageLinkList.push(url);
}
});
chatonsCatalog.linkList = pageLinkList;
console.log('pageLinkList.length', pageLinkList.length)
pageLinkList.forEach((indexPage, urlPage) => {
if (indexPage === 0) {
console.log('pageLinkList.length', pageLinkList.length);
let indexPage = 0;
pageLinkList.forEach((urlPage) => {
if (indexPage < 100) {
let shouldWeSave = indexPage === pageLinkList.length - 1
scrapeDataCatalogcontent(urlPage);
scrapeDataCatalogcontent(urlPage, shouldWeSave);
}
if (indexPage === pageLinkList.length - 1){
persistCatalog();
}
console.log('indexPage', indexPage, pageLinkList.length)
// console.log('indexPage', indexPage, pageLinkList.length)
indexPage++;
});
} catch (e) {
@ -67,6 +67,8 @@ async function scrapeDataPages() {
function persistCatalog() {
chatonsCatalog.pages = pagesChatons;
// console.log('pagesChatons', pagesChatons)
fs.writeFile('output/chatons_links.json', JSON.stringify(chatonsCatalog, null, 2), (err) => {
if (err) {
console.error(err);
@ -77,7 +79,7 @@ function persistCatalog() {
}
async function scrapeDataCatalogcontent(url_page) {
async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
try {
// Fetch HTML of the page we want to scrape
const { data } = await axios.get(url_page);
@ -85,11 +87,37 @@ async function scrapeDataCatalogcontent(url_page) {
// Load HTML we fetched in the previous line
const $ = cheerio.load(data);
chatonsCatalog.pages.push({
name: $('.chatons-public-subtitle').eq(0).innerText,
url : $('.field--name-field-website-url').eq(0)?.innerText,
rss : $('.field--name-field-rss-feed').eq(0)?.innerText,
// logiciels du chatons
let softwares =[]
$('.view-kitten-software ul li a').each((idx, el) => {
// console.log('idx', idx)
// console.log('el', el)
softwares.push({
name: el.textContent,
link: el.attribs['href']
})
})
// console.log('org', $('.field--name-field-structure-organization')?.text());
pagesChatons.push({
name: $('h2.chatons-public-subtitle').eq(0).text().trim(),
url : $('.field--name-field-website-url .field__item')?.text().trim(),
rss : $('.field--name-field-rss-feed .field__item')?.text().trim(),
organization : $('.field--name-field-structure-organization .field__item')?.text(),
structure : $('.field--name-field-structure-type .field__item')?.text(),
geo_area : $('.field--name-field-geo-area .field__item')?.text(),
creation : $('.field--name-field-structure-creation .field__item')?.text(),
since : $('.field--name-field-member-since .field__item')?.text(),
zip_code : $('.field--name-field-zip-code .field__item')?.text(),
city : $('.field--name-field-city .field__item')?.text(),
softwares,
});
if(shouldWeSave){
setTimeout(persistCatalog,2000)
}
} catch (e) {
console.error('e', e);
}

File diff suppressed because it is too large Load Diff