130 lines
3.5 KiB
TypeScript
130 lines
3.5 KiB
TypeScript
/**
|
|
Scrapes the book catalogue of the Briis-sous-Forges media library.
|
|
**/
|
|
// @ts-ignore
|
|
import https from 'https';
|
|
import WriteFile from "./utils";
|
|
|
|
const axios = require('axios');
|
|
const cheerio = require('cheerio');
|
|
/** Host name of the catalogue web site (no scheme, no path). */
const url = "www.mediatheque-de-briis-sous-forges.net";

/**
 * Path of a catalogue result page. The initial value ends with `page=` and has
 * no page number; `main` reassigns it with the page number appended.
 */
let fetching_path = "/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=";
|
|
|
|
/** One catalogue entry collected by the scraper; every field is plain text. */
interface Book {
  /** Author text of the notice. */
  author: string;
  /** Title text of the notice. */
  title: string;
  /** Descriptive text that accompanies the title. */
  description: string;
  /** Format label of the item. */
  format: string;
  /** Value of the `src` attribute of the notice image. */
  img: string;
}
|
|
|
|
/** Accumulator for every book scraped so far, across all pages. */
const books: Array<Book> = [];

// Other pages follow the same URL pattern, for example:
// http://www.mediatheque-de-briis-sous-forges.net/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=2

/** Number of the last catalogue page to fetch. */
let page_max = 1927;
// page_max = 4
|
|
|
|
/**
 * Parses an HTML document with cheerio and selects every `table.notice`
 * element it contains.
 *
 * @param html - Raw HTML markup to parse.
 * @returns The cheerio selection matching `table.notice`.
 */
const getTables = (html: string): any => cheerio.load(html)("table.notice");
|
|
|
|
/**
 * Downloads a document over HTTPS with a GET request and resolves with its
 * body decoded as UTF-8 text.
 *
 * @param hostname - Host to contact (no scheme).
 * @param path - Request path, including any query string.
 * @returns A promise resolving to the full response body.
 * @throws Rejects when the request or the response stream errors, or when the
 *   server answers with a non-2xx status code.
 */
const getHtml = async (hostname: string, path: string): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    https
      .get({ hostname, path, method: "GET" }, (res) => {
        const status = res.statusCode ?? 0;
        if (status < 200 || status >= 300) {
          // Drain the response so the socket is released, then report the
          // failure instead of handing an error page to the parser.
          res.resume();
          reject(
            new Error(`GET https://${hostname}${path} failed with HTTP status ${status}`)
          );
          return;
        }

        // Let the stream decode the bytes: appending raw Buffer chunks to a
        // string corrupts multi-byte characters split across two chunks.
        res.setEncoding("utf8");

        let html = "";
        res.on("data", (chunk: string) => {
          html += chunk;
        });
        res.on("end", () => {
          resolve(html);
        });
        // A failure while the body is streaming must settle the promise too.
        res.on("error", reject);
      })
      .on("error", (error) => {
        console.error(error);
        reject(error);
      });
  });
|
|
|
|
// fetchData(url).then((res: any) => {
|
|
// const html = res.data;
|
|
// const $ = cheerio.load(html);
|
|
// const statsTable :any = $('table.notice');
|
|
// console.log('statsTable', statsTable)
|
|
// statsTable.each(function(){
|
|
// let elem:any = this;
|
|
// let author = $(elem).find('td').eq(2).text();
|
|
// // let img = $(this).find('img').attr('src');
|
|
// // console.log(elem);
|
|
// console.log(author);
|
|
// });
|
|
// })
|
|
|
|
|
|
/**
 * Serialises the collected `books` as JSON indented with two spaces and
 * passes the result to `WriteFile` under the name `books.json`.
 */
function writeBookScrapping() {
  const serializedBooks = JSON.stringify(books, null, 2);
  WriteFile('books.json', serializedBooks);
}
|
|
|
|
|
|
// loop on all pages
|
|
|
|
/**
 * Extracts one `Book` from each table of a cheerio selection and appends it
 * to the module-level `books` array.
 *
 * The text of the second `td` of a table is split into lines:
 * - line 1 is stored as the author;
 * - line 2 is split on "/": the first segment is stored as the title and the
 *   second segment as the description;
 * - the format is the second ":"-separated segment of the title segment.
 *
 * Any piece that is absent (no second line, no "/", no ":", no image) is
 * stored as an empty string, so one malformed notice no longer throws and
 * every pushed entry satisfies the `Book` interface.
 *
 * @param tables - Cheerio selection of notice tables (as returned by `getTables`).
 */
const scrapOnePage = (tables: any) => {
  tables.each((_: any, table: any) => {
    const $ = cheerio.load(table);

    const lines: string[] = $(table).find('td').eq(1).text().split('\n');
    const author = lines[0] ?? "";

    // A missing second line is treated as empty text instead of crashing.
    const [title = "", description = ""] = (lines[1] ?? "").split('/');
    const format = title.split(':')[1] ?? "";

    const img_src: string | undefined = $(table).find('td img').attr('src');
    console.log(img_src);

    books.push({
      author,
      title,
      description,
      format,
      img: img_src ?? "",
    });
  });
};
|
|
|
|
/**
 * Walks the catalogue pages from 1 to `page_max` one after another. For each
 * page it updates `fetching_path`, downloads the HTML, selects the notice
 * tables and scrapes them. A failure on a page is logged and the loop moves
 * on to the next page. Once all pages are done the collected books are
 * written out through `writeBookScrapping`.
 */
async function main() {
  let page = 1;
  while (page <= page_max) {
    fetching_path = "/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=" + page;

    try {
      const html = await getHtml(url, fetching_path);
      const tables = getTables(html);
      scrapOnePage(tables);
    } catch (error) {
      // Best effort: a page that fails is reported and skipped.
      console.log(error);
    }

    page += 1;
  }

  writeBookScrapping();
}
|
|
|
|
// Entry point: run the scrape and report any failure that escapes `main`
// (for example while writing the output) instead of leaving the promise
// rejection unhandled.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
|
|
|
|