/**
 * Scrapes the book catalogue of the Briis-sous-Forges media library.
 *
 * Walks every result page of the online catalogue, extracts one record per
 * `table.notice` element and finally hands the collected records, serialized
 * as pretty-printed JSON, to `WriteFile` under the name `books.json`.
 */

// @ts-ignore
import https from 'https';
import WriteFile from "./utils";

// NOTE(review): axios is only referenced by the commented-out legacy code
// further down; the require is kept so that module loading is unchanged.
const axios = require('axios');
const cheerio = require('cheerio');

/** Host name of the catalogue server (no scheme, as expected by `https.get`). */
const url = "www.mediatheque-de-briis-sous-forges.net";

/** Path of a catalogue result page; the 1-based page number is appended to it. */
const catalogue_path = "/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=";

/** Path of the page currently being fetched (updated by `main` on every iteration). */
let fetching_path: string = catalogue_path;

/** One scraped catalogue entry. Every field is always a string ("" when missing). */
interface Book {
  author: string
  title: string
  description: string
  format: string
  img: string
}

/** Accumulates the records of all pages until they are written out. */
const books: Book[] = [];

// other pages:
// http://www.mediatheque-de-briis-sous-forges.net/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=2
/** Number of the last result page to fetch (pages are numbered from 1). */
let page_max = 1927
// page_max = 4

/**
 * Parses a result page and selects its notice tables.
 *
 * @param html Raw HTML of one catalogue result page.
 * @returns The cheerio selection matching `table.notice`.
 */
const getTables = (html: string): any => {
  const $ = cheerio.load(html);
  return $("table.notice");
};

/**
 * Downloads a page with an HTTPS GET request.
 *
 * @param hostname Server host name.
 * @param path Request path, including the query string.
 * @returns The response body decoded as UTF-8.
 * @throws Rejects on a network error, a response-stream error, or a non-2xx
 *   HTTP status.
 */
const getHtml = async (hostname: string, path: string): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    https
      .get(
        {
          hostname,
          path,
          // port:80,
          method: "GET",
        },
        (res) => {
          const status = res.statusCode ?? 0;
          if (status < 200 || status >= 300) {
            // Drain the body so the socket is released, then report the failure
            // instead of scraping an error or redirect page.
            res.resume();
            reject(new Error(`Unexpected HTTP status ${status} for ${hostname}${path}`));
            return;
          }

          // Decode at the stream level: concatenating raw Buffer chunks would
          // corrupt multi-byte characters that straddle a chunk boundary.
          res.setEncoding("utf8");

          let html = "";
          res.on("data", (chunk: string) => {
            html += chunk;
          });
          res.on("end", () => {
            resolve(html);
          });
          res.on("error", reject);
        }
      )
      .on("error", (error) => {
        console.error(error);
        reject(error);
      });
  });

// Legacy axios-based attempt, kept for reference:
// fetchData(url).then((res: any) => {
//   const html = res.data;
//   const $ = cheerio.load(html);
//   const statsTable :any = $('table.notice');
//   console.log('statsTable', statsTable)
//   statsTable.each(function(){
//     let elem:any = this;
//     let author = $(elem).find('td').eq(2).text();
//     // let img = $(this).find('img').attr('src');
//     // console.log(elem);
//     console.log(author);
//   });
// })

/** Serializes every collected book as indented JSON and passes it to `WriteFile` as `books.json`. */
function writeBookScrapping() {
  WriteFile('books.json', JSON.stringify(books, null, 2))
}

/**
 * Builds a `Book` from the text of a notice cell.
 *
 * The first line of the text is taken as the author. The second line is split
 * on "/": the first segment is the title and the second one the description
 * (any further segments are ignored). The format is whatever follows the first
 * ":" inside the title segment. Missing parts become empty strings so that the
 * record always has the full `Book` shape.
 *
 * @param text Text content of the notice's second `td`.
 * @param imgSrc `src` attribute of the notice's image, if any.
 */
const parseNotice = (text: string, imgSrc: string | undefined): Book => {
  const [author = "", details = ""] = text.split('\n');
  const [title = "", description = ""] = details.split('/');
  const format = title.split(':')[1] ?? "";
  return { author, title, description, format, img: imgSrc ?? "" };
};

/**
 * Extracts one book per notice table and appends it to `books`.
 *
 * @param tables Cheerio selection of `table.notice` elements, as returned by
 *   `getTables`.
 */
const scrapOnePage = (tables: any) => {
  tables.each((_: any, table: any) => {
    const $ = cheerio.load(table);
    const text_description: string = $(table).find('td').eq(1).text();
    const img_src: string | undefined = $(table).find('td img').attr('src');
    console.log(img_src);
    books.push(parseNotice(text_description, img_src));
  });
}

/**
 * Fetches and scrapes pages 1 to `page_max` one after another, then writes the
 * result. A page that fails to download or parse is logged and skipped so that
 * the remaining pages are still processed.
 */
async function main(): Promise<void> {
  for (let page_counter = 1; page_counter <= page_max; page_counter++) {
    fetching_path = catalogue_path + page_counter
    try {
      const html = await getHtml(url, fetching_path);
      scrapOnePage(getTables(html));
    } catch (error) {
      console.log(error);
    }
  }
  writeBookScrapping()
}

// Report a failure of the final write (or any other unexpected error) instead
// of leaving an unhandled promise rejection.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});