Clean task content by removing lines made of special characters; add word statistics for the content

This commit is contained in:
Tykayn 2023-03-06 00:01:12 +01:00 committed by tykayn
parent bd538bcee8
commit 3985b4ba33
1 changed files with 115 additions and 48 deletions

View File

@ -17,6 +17,7 @@ let headers = []
let tasksObjectsForJsonExport = [] let tasksObjectsForJsonExport = []
let headersByKind = {} let headersByKind = {}
let writeJsonAfterParse = false; let writeJsonAfterParse = false;
writeJsonAfterParse = true;
/************************************************************** /**************************************************************
* fetch the source orgmode file to read its contents * fetch the source orgmode file to read its contents
@ -42,10 +43,16 @@ fs.stat(sourceFilePath, function (err, stat) {
* search elements * search elements
*********************/ *********************/
let stateKeywordList = ['SOMEDAY', 'NEXT', 'TODO', 'CANCELLED', 'DONE', 'WAITING']; let stateKeywordList = ['SOMEDAY', 'NEXT', 'TODO', 'CANCELLED', 'DONE', 'WAITING'];
let dateKeywordList = ['CREATED', 'SCHEDULED', 'DEADLINE', 'CLOSED','Refiled']; let dateKeywordList = ['CREATED', 'SCHEDULED', 'DEADLINE', 'CLOSED', 'Refiled'];
let sectionKeywordList = ['PROPERTIES', 'LOGBOOK', 'END']; let sectionKeywordList = ['PROPERTIES', 'LOGBOOK', 'END'];
let propertiesSection = {}
let logBookSection = {} let propertiesSection = {} // TODO properties listing
let logBookSection = {} // TODO logbook listing
let statistics = {
tags: {},
words: {}
}
let headerKeywordSearch = '[' + stateKeywordList.join('|') + ']' let headerKeywordSearch = '[' + stateKeywordList.join('|') + ']'
/** /**
@ -55,7 +62,7 @@ let headerKeywordSearch = '[' + stateKeywordList.join('|') + ']'
let task = { let task = {
header: "", header: "",
level: "", level: "",
content: "", corpus: "",
state: "", state: "",
tags: [], tags: [],
tagsInherited: [], tagsInherited: [],
@ -63,17 +70,39 @@ let task = {
logbook: {}, logbook: {},
properties: {}, properties: {},
} }
// init first task object as empty clone
let currentTask = {...task};
let isHeader = false; let isHeader = false;
let isProperty = false; let isProperty = false;
let isLogbook = false; let isLogbook = false;
let isFirst = true; let isFirst = true;
// init first task object as empty clone
let currentTask = {...task};
/**
 * Push the task currently being built onto the export list, then start a
 * brand-new empty task in `currentTask`.
 *
 * Scalar fields of the new task come from the `task` template. Every array or
 * object field is replaced by a fresh empty container instead of being copied
 * by reference: a plain `{...task}` spread is shallow, so all tasks would
 * otherwise share one `tags` array (and one `logbook` / `properties` object)
 * and a `currentTask.tags.push(...)` on one task would show up on all of them.
 */
function addAndRefreshCurrentTask() {
  tasksObjectsForJsonExport.push(currentTask);

  const freshTask = { ...task };
  for (const [field, value] of Object.entries(task)) {
    if (Array.isArray(value)) {
      freshTask[field] = [];
    } else if (value !== null && typeof value === 'object') {
      freshTask[field] = {};
    }
  }
  // Always reset `dates`, whether or not the template declares that field.
  freshTask.dates = {};

  currentTask = freshTask;
}
/**
 * Add every word of a sentence to the shared `statistics.words` tally.
 *
 * Words are separated by any run of whitespace, so consecutive spaces do not
 * produce an empty "word". Counters are looked up with an own-property check:
 * a bare `statistics.words[word]` would find inherited members such as
 * `constructor` or `toString` and turn their count into NaN.
 *
 * @param {string} sentence text whose words are counted; null/undefined counts nothing
 */
function makeWordsStatistics(sentence) {
  const words = String(sentence ?? '').split(/\s+/);

  for (const word of words) {
    if (word === '') {
      // leading/trailing whitespace yields empty chunks; they are not words
      continue;
    }
    const alreadyCounted = Object.prototype.hasOwnProperty.call(statistics.words, word);
    statistics.words[word] = (alreadyCounted ? statistics.words[word] : 0) + 1;
  }
}
/********************** /**********************
* loop to parse all * loop to parse all
*********************/ *********************/
fs.readFile(sourceFilePath, 'utf8', function (err, data) { fs.readFile(sourceFilePath, 'utf8', function (err, data) {
if (err) { if (err) {
return console.log(err); return console.log(err);
} }
@ -90,10 +119,8 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
if (line.match(/^\*+? /)) { if (line.match(/^\*+? /)) {
// add last task to export list // add last task to export list
if (!isFirst) { if (!isFirst) {
tasksObjectsForJsonExport.push(currentTask)
console.log('currentTask.dates', currentTask.dates) addAndRefreshCurrentTask();
currentTask = {...task};
} else { } else {
isFirst = false; isFirst = false;
} }
@ -105,11 +132,9 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
// create a new task // create a new task
line = line.replace('*', '') headers.push(cleanHeader(line))
line = line.replace(stateKeywordList, [].fill('', 0, stateKeywordList.length)) currentTask.header = cleanHeader(line);
makeWordsStatistics(cleanHeader(line));
headers.push(line)
currentTask.header = line;
stateKeywordList.forEach(keyword => { stateKeywordList.forEach(keyword => {
let keywordIsFound = lineHasKeyword(line, keyword) let keywordIsFound = lineHasKeyword(line, keyword)
@ -123,57 +148,72 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
let tagsFound = line.match(/\:(.*)\:/g) let tagsFound = line.match(/\:(.*)\:/g)
if (tagsFound) { if (tagsFound) {
tagsFound = tagsFound[0]; tagsFound = tagsFound[0];
console.log('tagsFound', tagsFound) let tagList = tagsFound.split(':');
tagsFound = tagsFound.split(':').filter(item => item.length) tagList?.forEach(tag => {
currentTask.tags = tagsFound; if (tag.length > 1) {
if (!statistics.tags[tag]) {
statistics.tags[tag] = 0
}
statistics.tags[tag]++
currentTask.tags.push(tag)
}
})
} }
// fin des recherches dans la ligne de Header // ------------- fin des recherches dans la ligne de Header -------------
} else { } else {
isHeader = false; isHeader = false;
} }
// examen des lignes de corps de tâche, ou de corps de section suite au header. // examen des lignes de corps de tâche, ou de corps de section suite au header.
// classer les dates de création, cloture, et de logbook // classer les dates de création, cloture, et de logbook
let dateFound = searchDate(line) let dateFound = searchDate(line)
if(dateFound){ if (dateFound) {
dateKeywordList.forEach(keyword => { dateKeywordList.forEach(keyword => {
if (lineHasSubstring(line, keyword)) { if (lineHasSubstring(line, keyword)) {
if (!currentTask.dates[keyword]) { if (!currentTask.dates[keyword]) {
currentTask.dates[keyword] = ''; currentTask.dates[keyword] = '';
}
currentTask.dates[keyword] = new Date(dateFound[0]);
} else {
// console.log('keyword', keyword)
}
})
} else {
if (line.indexOf(dateKeywordList) !== -1 && line.indexOf(stateKeywordList) !== -1 && line.indexOf(sectionKeywordList) !== -1) {
makeWordsStatistics(line)
// ajouter le corps complet de la section après le header
if (line.length && !isHeader) {
let cleanedLine = line.replace(/\s\s/g, ' ');
cleanedLine = line.replace(/ {2,}/g, ' ')
currentTask.corpus += `${cleanedLine}
`
} }
currentTask.dates[keyword] = new Date(dateFound[0]);
} else {
// console.log('keyword', keyword)
} }
})
} }
// ajouter le corps complet de la section après le header
if (line.length && !isHeader) {
let cleanedLine = line.replace(/\s\s/g, ' ')
cleanedLine = line.replace(/ {2,}/g, ' ')
console.log('line', cleanedLine)
currentTask.corpus += `
` + cleanedLine;
}
}) })
// ajouter la dernière tâche parsée // ajouter la dernière tâche parsée
tasksObjectsForJsonExport.push(currentTask) addAndRefreshCurrentTask();
console.log('headers', headers)
console.log(" parsing fini") console.log(" parsing fini")
stateKeywordList.forEach(keyword => console.log('nombre de headers', keyword, headersByKind[keyword]?.length)) // stateKeywordList.forEach(keyword => console.log('nombre de headers', keyword, headersByKind[keyword]?.length))
const jsonContent = { const jsonContent = {
statistics: { statistics: {
lines_count: everyline.length, lines_count: everyline.length,
headers_count: headers.length, headers_count: headers.length,
statistics: Object.keys(statistics).sort(function (a, b) {
return statistics[a] - statistics[b]
})
}, },
meta_data: { meta_data: {
author: '@tykayn@mastodon.Cipherbliss.com', author: '@tykayn@mastodon.Cipherbliss.com',
@ -183,13 +223,14 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
}, },
tasks_list: tasksObjectsForJsonExport tasks_list: tasksObjectsForJsonExport
} }
console.log('statistics', statistics)
// console.log('tasksObjectsForJsonExport', jsonContent) // console.log('tasksObjectsForJsonExport', jsonContent)
if (writeJsonAfterParse) { if (writeJsonAfterParse) {
writeJsonFile('export_' + sourceFileName + '.json', JSON.stringify(jsonContent)); writeJsonFile('export_' + sourceFileName + '_parsed.json', JSON.stringify(jsonContent));
} }
return;
}) })
function lineHasKeyword(line, keyword = 'TODO') { function lineHasKeyword(line, keyword = 'TODO') {
@ -198,13 +239,22 @@ function lineHasKeyword(line, keyword = 'TODO') {
if (isFound) { if (isFound) {
createNewHeaderKind(keyword) createNewHeaderKind(keyword)
headersByKind[keyword].push(line); headersByKind[keyword].push(line);
if (!statistics[keyword]) {
statistics[keyword] = 0
}
statistics[keyword]++
} }
return isFound; return isFound;
} }
/**
 * Tell whether a line contains the given keyword, and count the hit.
 *
 * The per-keyword counter in `statistics` is only incremented when the keyword
 * is actually present, matching what `lineHasKeyword` does; counting every
 * call would just measure how many lines were scanned.
 *
 * @param {string} line text to search in
 * @param {string} keyword substring to look for
 * @returns {boolean} true when `keyword` occurs in `line`
 */
function lineHasSubstring(line, keyword) {
  const isFound = line.includes(keyword);

  if (isFound) {
    if (!statistics[keyword]) {
      statistics[keyword] = 0;
    }
    statistics[keyword]++;
  }

  return isFound;
}
function createNewHeaderKind(keyword) { function createNewHeaderKind(keyword) {
@ -226,14 +276,14 @@ function searchDate(line) {
let simpleDayHour = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}/) let simpleDayHour = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}/)
let simpleDayHourSec = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}\:\d{2}/) let simpleDayHourSec = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}\:\d{2}/)
if(simpleDayHourSec){ if (simpleDayHourSec) {
return simpleDayHourSec; return simpleDayHourSec;
} }
if(simpleDayHour){ if (simpleDayHour) {
return simpleDayHour; return simpleDayHour;
} }
if(simpleDay){ if (simpleDay) {
return simpleDay; return simpleDay;
} }
@ -249,6 +299,23 @@ function compareDatesAndKeepOldest(date1, date2) {
date2 = moment(date2) date2 = moment(date2)
} }
/**
 * Get the bare title of an org-mode headline.
 *
 * Strips, in order: the leading stars, the state keyword when it is the first
 * word of the headline, bracketed cookies such as `[#A]`, and the trailing
 * `:tag1:tag2:` group. Runs of whitespace left behind are collapsed.
 *
 * The state keyword and the tags are matched by position rather than anywhere
 * in the text, so a title like `ABANDONED project` keeps its `DONE` letters
 * and `Call at 10:30 :work:` keeps its time.
 *
 * @param {string} line raw headline line
 * @returns {string} the cleaned, trimmed title
 */
function cleanHeader(line) {
  let header = String(line);

  // leading stars that mark the headline level
  header = header.replace(/^\*+\s+/, '');

  // state keyword, only when it is the very first word
  const firstWord = header.split(/\s+/, 1)[0];
  if (stateKeywordList.includes(firstWord)) {
    header = header.slice(firstWord.length);
  }

  // bracketed cookies (priority, progress, ...)
  header = header.replace(/\[.*\]/g, '');

  // tag group, which sits at the end of the headline
  header = header.replace(/(?:^|\s+):[^\s:]+(?::[^\s:]+)*:\s*$/, '');

  // collapse whitespace left by the removals; a string-pattern replace would
  // only touch the first occurrence and could glue two words together
  header = header.replace(/\s{2,}/g, ' ');

  return header.trim();
}
function writeJsonFile(fileName, fileContent) { function writeJsonFile(fileName, fileContent) {
console.log('write file ', fileName); console.log('write file ', fileName);