hop
This commit is contained in:
parent
198e825e5f
commit
b02786046d
@ -1,4 +1,9 @@
|
||||
# Data scraping de framalibre
|
||||
# Data scraping
|
||||
## du catalogue des chatons.org
|
||||
node chatons.js
|
||||
|
||||
et hop, on obtient un export des caractéristiques des chatons
|
||||
## de framalibre
|
||||
configurer main.js pour définir l'ID maximal
|
||||
# Utilisation
|
||||
Installer les paquets npm
|
||||
|
60
chatons.js
60
chatons.js
@ -8,6 +8,7 @@ const pretty = require('pretty');
|
||||
const fs = require('fs');
|
||||
|
||||
const pageLinkList = [];
|
||||
const pagesChatons = [];
|
||||
const chatonsCatalog = {
|
||||
linkList: [],
|
||||
pages : [],
|
||||
@ -36,28 +37,27 @@ async function scrapeDataPages() {
|
||||
if (element.attribs && element.attribs['hreflang']) {
|
||||
|
||||
let url = 'https://www.chatons.org' + element.attribs['href'];
|
||||
console.log('url', url);
|
||||
console.log('element.getAttribute(\'href\')', url);
|
||||
// console.log('url', url);
|
||||
// console.log('element.getAttribute(\'href\')', url);
|
||||
pageLinkList.push(url);
|
||||
}
|
||||
});
|
||||
|
||||
chatonsCatalog.linkList = pageLinkList;
|
||||
|
||||
console.log('pageLinkList.length', pageLinkList.length)
|
||||
pageLinkList.forEach((indexPage, urlPage) => {
|
||||
if (indexPage === 0) {
|
||||
console.log('pageLinkList.length', pageLinkList.length);
|
||||
let indexPage = 0;
|
||||
pageLinkList.forEach((urlPage) => {
|
||||
if (indexPage < 100) {
|
||||
let shouldWeSave = indexPage === pageLinkList.length - 1
|
||||
|
||||
scrapeDataCatalogcontent(urlPage);
|
||||
scrapeDataCatalogcontent(urlPage, shouldWeSave);
|
||||
|
||||
}
|
||||
|
||||
if (indexPage === pageLinkList.length - 1){
|
||||
|
||||
persistCatalog();
|
||||
|
||||
}
|
||||
console.log('indexPage', indexPage, pageLinkList.length)
|
||||
// console.log('indexPage', indexPage, pageLinkList.length)
|
||||
indexPage++;
|
||||
});
|
||||
|
||||
} catch (e) {
|
||||
@ -67,6 +67,8 @@ async function scrapeDataPages() {
|
||||
|
||||
function persistCatalog() {
|
||||
|
||||
chatonsCatalog.pages = pagesChatons;
|
||||
// console.log('pagesChatons', pagesChatons)
|
||||
fs.writeFile('output/chatons_links.json', JSON.stringify(chatonsCatalog, null, 2), (err) => {
|
||||
if (err) {
|
||||
console.error(err);
|
||||
@ -77,7 +79,7 @@ function persistCatalog() {
|
||||
|
||||
}
|
||||
|
||||
async function scrapeDataCatalogcontent(url_page) {
|
||||
async function scrapeDataCatalogcontent(url_page, shouldWeSave) {
|
||||
try {
|
||||
// Fetch HTML of the page we want to scrape
|
||||
const { data } = await axios.get(url_page);
|
||||
@ -85,11 +87,37 @@ async function scrapeDataCatalogcontent(url_page) {
|
||||
// Load HTML we fetched in the previous line
|
||||
const $ = cheerio.load(data);
|
||||
|
||||
chatonsCatalog.pages.push({
|
||||
name: $('.chatons-public-subtitle').eq(0).innerText,
|
||||
url : $('.field--name-field-website-url').eq(0)?.innerText,
|
||||
rss : $('.field--name-field-rss-feed').eq(0)?.innerText,
|
||||
|
||||
// logiciels du chatons
|
||||
|
||||
let softwares =[]
|
||||
$('.view-kitten-software ul li a').each((idx, el) => {
|
||||
// console.log('idx', idx)
|
||||
// console.log('el', el)
|
||||
softwares.push({
|
||||
name: el.textContent,
|
||||
link: el.attribs['href']
|
||||
})
|
||||
})
|
||||
// console.log('org', $('.field--name-field-structure-organization')?.text());
|
||||
pagesChatons.push({
|
||||
name: $('h2.chatons-public-subtitle').eq(0).text().trim(),
|
||||
url : $('.field--name-field-website-url .field__item')?.text().trim(),
|
||||
rss : $('.field--name-field-rss-feed .field__item')?.text().trim(),
|
||||
organization : $('.field--name-field-structure-organization .field__item')?.text(),
|
||||
structure : $('.field--name-field-structure-type .field__item')?.text(),
|
||||
geo_area : $('.field--name-field-geo-area .field__item')?.text(),
|
||||
creation : $('.field--name-field-structure-creation .field__item')?.text(),
|
||||
since : $('.field--name-field-member-since .field__item')?.text(),
|
||||
zip_code : $('.field--name-field-zip-code .field__item')?.text(),
|
||||
city : $('.field--name-field-city .field__item')?.text(),
|
||||
softwares,
|
||||
});
|
||||
|
||||
if(shouldWeSave){
|
||||
|
||||
setTimeout(persistCatalog,2000)
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('e', e);
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user