scripts/scraping/main.ts

130 lines
3.5 KiB
TypeScript
Raw Normal View History

2023-08-17 12:40:49 +02:00
/**
 * Scraping des livres de la médiathèque de Briis-sous-Forges.
 */
// @ts-ignore
import https from 'https';
import WriteFile from "./utils";
const axios = require('axios');
const cheerio = require('cheerio');
/** Hostname of the media library's catalogue site, passed as the request host (no scheme). */
const url = "www.mediatheque-de-briis-sous-forges.net";
/**
 * Path of the paginated catalogue search; the page number is appended after `page=`.
 * This is mutable module state: `main` reassigns it for every page it requests.
 */
let fetching_path = "/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=";
2023-08-17 12:40:49 +02:00
/**
 * One catalogue record, as stored in the scraped output.
 * All fields are plain text taken from a `table.notice` element of a result page.
 */
interface Book {
  /** First line of the notice text. */
  author: string;
  /** Second line of the notice text, up to the first `/`. */
  title: string;
  /** Part of the second line that follows the first `/`. */
  description: string;
  /** Part of the title that follows its first `:`. */
  format: string;
  /** `src` attribute of the notice's image. */
  img: string;
}
2023-08-17 12:53:32 +02:00
2023-08-17 12:40:49 +02:00
/** Accumulates every book extracted so far, across all scraped pages. */
const books: Book[] = [];
// Other pages follow the same URL pattern, for example:
// http://www.mediatheque-de-briis-sous-forges.net/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=2
/** Number of the last catalogue page to request (page numbering starts at 1). */
let page_max = 1927;
// Presumably a shortcut for quick trial runs; uncomment to fetch only a few pages.
// page_max = 4
2023-08-17 12:40:49 +02:00
/**
 * Parses an HTML document and selects every `table.notice` element in it.
 *
 * @param html - Raw HTML markup of a page.
 * @returns The cheerio selection holding the matching tables (untyped).
 */
const getTables = (html: string): any => cheerio.load(html)("table.notice");
/**
 * Downloads a page over HTTPS and resolves with its body as text.
 *
 * The response stream is switched to UTF-8 decoding so that a multi-byte
 * character split across two network chunks is reassembled correctly
 * (concatenating raw chunks one by one would corrupt it).
 *
 * @param hostname - Host to contact, without scheme.
 * @param path - Request path, including the query string.
 * @returns A promise resolving with the complete response body.
 * The promise rejects when the request or the response stream fails, or when
 * the server answers with a status outside the 2xx range.
 */
const getHtml = async (hostname: string, path: string): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    https
      .get(
        {
          hostname,
          path,
          method: "GET",
        },
        (res) => {
          const { statusCode } = res;
          if (statusCode === undefined || statusCode < 200 || statusCode >= 300) {
            // Discard the body so the socket is released, then report the failure
            // instead of handing an error page to the parser.
            res.resume();
            reject(new Error(`GET ${hostname}${path} failed with status ${statusCode}`));
            return;
          }

          // Let the stream decode text itself; it buffers incomplete characters.
          res.setEncoding("utf8");
          let html = "";
          res.on("data", (chunk: string) => {
            html += chunk;
          });
          res.on("end", () => {
            resolve(html);
          });
          res.on("error", reject);
        }
      )
      .on("error", (error) => {
        console.error(error);
        reject(error);
      });
  });
// fetchData(url).then((res: any) => {
// const html = res.data;
// const $ = cheerio.load(html);
// const statsTable :any = $('table.notice');
// console.log('statsTable', statsTable)
// statsTable.each(function(){
// let elem:any = this;
// let author = $(elem).find('td').eq(2).text();
// // let img = $(this).find('img').attr('src');
// // console.log(elem);
// console.log(author);
// });
// })
/**
 * Serialises the collected `books` as pretty-printed JSON (2-space indent)
 * and passes the result to `WriteFile` under the name `books.json`.
 */
function writeBookScrapping(): void {
  WriteFile('books.json', JSON.stringify(books, null, 2));
}
2023-08-17 12:53:32 +02:00
/**
 * Splits the text of a notice cell into the textual fields of a book.
 *
 * - author: the first line;
 * - title: the second line up to its first `/`;
 * - description: what follows that first `/` (up to the next one);
 * - format: what follows the first `:` of the title (up to the next one).
 *
 * Any piece that is absent becomes an empty string instead of raising,
 * so one oddly formatted notice cannot abort the rest of the page.
 */
function parseNoticeText(text: string): {
  author: string;
  title: string;
  description: string;
  format: string;
} {
  const [author = '', secondLine = ''] = text.split('\n');
  const [title = '', description = ''] = secondLine.split('/');
  const [, format = ''] = title.split(':');
  return { author, title, description, format };
}

/**
 * Extracts one book from each table of the selection and appends it to the
 * module-level `books` array.
 *
 * @param tables - Cheerio selection of notice tables, as returned by `getTables`.
 */
const scrapOnePage = (tables: any) => {
  tables.each((_: any, table: any) => {
    const $ = cheerio.load(table);
    // The second cell of the table carries the textual description.
    const text_description: string = $(table).find('td').eq(1).text();
    const img_src: string | undefined = $(table).find('td img').attr('src');
    console.log(img_src);

    const { author, title, description, format } = parseNoticeText(text_description);
    books.push({
      author,
      title,
      description,
      format,
      // A notice without an image yields an empty string rather than `undefined`.
      img: img_src === undefined ? '' : img_src,
    });
  });
};
/**
 * Scrapes catalogue pages 1 to `page_max` one after the other, then calls
 * `writeBookScrapping` to save everything that was collected.
 *
 * A page that cannot be fetched or parsed is logged together with its page
 * number and skipped; the remaining pages are still processed.
 */
async function main(): Promise<void> {
  for (let page_counter = 1; page_counter <= page_max; page_counter++) {
    fetching_path = "/mediatheque-de-briis-sous-forges.net/opac/recherche/catalogue?node=0&value=0&page=" + page_counter;
    try {
      const html = await getHtml(url, fetching_path);
      scrapOnePage(getTables(html));
    } catch (error) {
      // Keep going: losing one page should not lose the whole run.
      console.error(`page ${page_counter} failed:`, error);
    }
  }
  writeBookScrapping();
}
// Script entry point: start the scrape; the returned promise is deliberately not awaited.
void main();