2023-08-09 23:10:45 +02:00
|
|
|
/**
 * Take a CSV file,
 * examine every column and its values,
 * keep the unique values of each column in memory,
 * then produce a new CSV that shows only the unique values of each column,
 * together with a count of those values.
 **/
|
|
|
|
import utils from './mappings/utils'
|
|
|
|
import {parse} from 'csv'
|
|
|
|
|
|
|
|
const fs = require('fs')
|
2023-08-11 18:41:32 +02:00
|
|
|
const minimist = require('minimist')
|
2023-08-09 23:10:45 +02:00
|
|
|
|
2023-08-18 11:25:02 +02:00
|
|
|
let mini_arguments: any = minimist(process.argv.slice(2))
|
|
|
|
|
2023-08-18 12:59:09 +02:00
|
|
|
// interface VarianceType {
|
|
|
|
// [key: string]: Array<string>
|
|
|
|
// }
|
2023-08-09 23:10:45 +02:00
|
|
|
|
|
|
|
let csv_content = 'variance de dataset\n';
|
2023-08-30 11:19:06 +02:00
|
|
|
let separator = ',';
|
|
|
|
let separator_fallback = ';';
|
2023-08-18 12:59:09 +02:00
|
|
|
let data_variance: any = {};
|
2023-08-09 23:10:45 +02:00
|
|
|
|
2023-08-30 11:19:06 +02:00
|
|
|
let folder = 'toilettes'
|
|
|
|
let inputFile = 'small.csv'
|
|
|
|
let fileNameOutput = `variance_${inputFile}`;
|
|
|
|
let inputPath = `./etalab_data/${folder}/${inputFile}`
|
2023-08-18 12:59:48 +02:00
|
|
|
// let inputPath = './etalab_data/toilettes/small_datas.csv'
|
2023-08-30 11:19:06 +02:00
|
|
|
let outputPath = `etalab_data/${folder}`
|
2023-08-18 11:25:02 +02:00
|
|
|
if (mini_arguments['source']) {
|
|
|
|
inputPath = mini_arguments['source']
|
|
|
|
}
|
2023-08-30 11:19:06 +02:00
|
|
|
if (mini_arguments['separator']) {
|
|
|
|
separator = mini_arguments['separator']
|
|
|
|
}
|
2023-08-09 23:10:45 +02:00
|
|
|
let columns_headings: Array<string> = [];
|
|
|
|
let lines_count = 0;
|
|
|
|
let longest_variance_count = 0;
|
|
|
|
|
2023-08-18 12:59:09 +02:00
|
|
|
|
|
|
|
function getColumnsFromRow(row: string) {
|
|
|
|
let headings: any = []
|
|
|
|
console.log('elem', row)
|
2023-08-30 11:19:06 +02:00
|
|
|
|
2023-08-18 12:59:09 +02:00
|
|
|
headings = row.split(separator)
|
|
|
|
return headings
|
|
|
|
}
|
|
|
|
|
2023-08-09 23:10:45 +02:00
|
|
|
// announce which file is about to be read
console.log('open file ', inputPath)
|
2023-08-18 13:25:58 +02:00
|
|
|
|
|
|
|
/**
|
|
|
|
* sort unique values ascending in each column
|
|
|
|
*/
|
|
|
|
function reorderValuesInDataVariance() {
|
|
|
|
columns_headings.forEach((heading: string) => {
|
|
|
|
|
|
|
|
data_variance[heading] = data_variance[heading].sort((a: any, b: any) => a - b)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2023-08-09 23:10:45 +02:00
|
|
|
// Read the whole CSV file, detect the separator from the first line, collect
// the unique values of every column into `data_variance`, then write the
// variance report. Mutates module state: separator, columns_headings,
// data_variance, lines_count, longest_variance_count.
fs.readFile(inputPath, function (err: any, fileData: any) {
    if (err) {
        throw new Error(err)
    } else {
        // columns: false → csv-parse delivers each record as an array of
        // cell values, not keyed objects.
        parse(fileData, {columns: false, trim: true}, function (err: any, lines: any) {
            // Your CSV data is in an array of arrays passed to this callback as rows.
            if (err) {
                throw new Error(err)
            }
            console.log('line ', lines_count)
            console.log('lines.length', lines.length)
            lines.forEach((line: any) => {
                // NOTE(review): takes only the first cell of the parsed record
                // and re-splits it manually below — this assumes the whole raw
                // line ended up in cell 0 (i.e. csv-parse did not split on our
                // separator). Quoted fields containing the separator would be
                // split incorrectly — confirm against real input files.
                line = line[0]
                if (lines_count === 0) {
                    // first line: header row — detect separator and headings
                    console.log('line', line)
                    if(line.indexOf(separator) === -1){
                        console.log('separator not found: ', separator, 'trying other separator')
                        if(line.indexOf(separator_fallback) === -1){
                            console.log('separator not found: ', separator_fallback)
                            // neither separator appears in the header: give up
                            throw new Error('no separator found in csv')
                        }else{
                            console.log('fallback separator found: ', separator_fallback, 'now using it ')
                            // switch the module-level separator for all later lines
                            separator = separator_fallback
                        }
                    }
                    columns_headings = getColumnsFromRow(line)
                    console.log('columns_headings.length', columns_headings.length)
                    // console.log('columns_headings', columns_headings)
                    // NOTE(review): `headers` is never used — Object.keys on an
                    // array yields index strings, probably leftover code.
                    let headers = Object.keys(columns_headings)
                    // start an empty unique-value collection per column
                    columns_headings.forEach((header: string) => {
                        data_variance[header] = []
                    })
                    console.log('data_variance', data_variance)
                } else {
                    // subsequent data lines
                    let column_index = 0
                    line.split(separator).forEach((value: string) => {
                        value = value.trim()
                        let column_header_current = columns_headings[column_index]
                        // console.log('column_index', column_index)
                        // for each column, check whether the value is already
                        // present in the variance index;
                        // if the value is new, add it
                        if (data_variance[column_header_current].indexOf(value) === -1) {
                            data_variance[column_header_current].push(value)
                            // track the largest unique-value set: it decides
                            // how many rows the report will have
                            if (
                                data_variance[column_header_current].length > longest_variance_count
                            ) {
                                longest_variance_count = data_variance[column_header_current].length
                            }
                        } else {
                            console.log('value', value, ' déjà présente dans la collection', column_header_current)
                        }
                        column_index++
                    })
                }
                lines_count++
            })
            console.log('lines_count', lines_count)
            console.log('longest_variance_count', longest_variance_count)
            reorderValuesInDataVariance()
            // utils.writeFile — project helper; presumably
            // (fileName, content, folder) — confirm signature in ./mappings/utils
            utils.writeFile(fileNameOutput, writeCSVVariance(), outputPath)
        })
    }
    // NOTE(review): parse() above is asynchronous, so this logs before
    // parsing actually finishes.
    console.log('parsing done')
    // console.log('data_variance', data_variance)
})
|
|
|
|
|
|
|
|
/**
|
|
|
|
* écrit un csv avec les données de variance du dataset donné
|
|
|
|
*/
|
|
|
|
function writeCSVVariance() {
|
|
|
|
|
2023-08-18 13:25:58 +02:00
|
|
|
let csv_content = `;variance de ${inputPath};généré le:;${new Date()};lignes du csv original:;${lines_count};fait avec make_variance_from_csv.ts de Tykayn
|
|
|
|
`
|
2023-08-09 23:10:45 +02:00
|
|
|
let columns = Object.keys(data_variance);
|
|
|
|
|
|
|
|
// add headings
|
|
|
|
columns_headings.forEach((heading: string) => {
|
|
|
|
csv_content = csv_content + separator + heading
|
|
|
|
})
|
|
|
|
csv_content = csv_content + '\n'
|
|
|
|
// add max length of variance for each column
|
|
|
|
let ii = 0
|
|
|
|
columns.forEach((column: string) => {
|
|
|
|
// console.log('column', column, data_variance[column].length)
|
|
|
|
csv_content = csv_content + separator + data_variance[column].length
|
|
|
|
ii++
|
|
|
|
})
|
|
|
|
|
|
|
|
csv_content = csv_content + '\n\n'
|
|
|
|
// add content of values
|
|
|
|
for (let ii = 0; ii < longest_variance_count; ii++) {
|
|
|
|
csv_content = csv_content + '\n'
|
|
|
|
columns.forEach((column: any) => {
|
|
|
|
if (ii < data_variance[column].length) {
|
|
|
|
|
|
|
|
let currentValue = data_variance[column][ii]
|
|
|
|
csv_content = csv_content + separator + currentValue
|
|
|
|
} else {
|
|
|
|
csv_content = csv_content + separator
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
return csv_content;
|
|
|
|
}
|