mapping-geojson-osm/make_variance_from_csv.ts

194 lines
6.3 KiB
TypeScript
Raw Normal View History

/**
 * Take a CSV file,
 * inspect every column and its values,
 * keep the unique values of each column in memory,
 * then write a new CSV that lists only the unique values of each column
 * together with the number of unique values per column.
 */
import utils from './mappings/utils'
import {parse} from 'csv'
const fs = require('fs')
2023-08-11 18:41:32 +02:00
const minimist = require('minimist')
2023-08-18 11:25:02 +02:00
let mini_arguments: any = minimist(process.argv.slice(2))
2023-08-18 12:59:09 +02:00
// interface VarianceType {
// [key: string]: Array<string>
// }
let csv_content = 'variance de dataset\n';
2023-09-01 11:12:54 +02:00
let separator = '\t';
2023-08-31 22:07:57 +02:00
let separator_fallback = ';';
2023-08-18 12:59:09 +02:00
let data_variance: any = {};
2023-08-31 22:07:57 +02:00
let folder = 'toilettes'
let inputFile = 'small.csv'
let fileNameOutput = `variance_${inputFile}`;
let inputPath = `./etalab_data/${folder}/${inputFile}`
2023-08-18 12:59:48 +02:00
// let inputPath = './etalab_data/toilettes/small_datas.csv'
2023-08-31 22:07:57 +02:00
let outputPath = `etalab_data/${folder}`
2023-09-01 11:12:54 +02:00
console.log('mini_arguments', mini_arguments)
if (mini_arguments['inputFile']) {
inputFile = `${mini_arguments['input-file']}`
}
2023-08-18 11:25:02 +02:00
if (mini_arguments['source']) {
inputPath = mini_arguments['source']
}
2023-08-31 22:07:57 +02:00
if (mini_arguments['separator']) {
separator = mini_arguments['separator']
}
let columns_headings: Array<string> = [];
let lines_count = 0;
let longest_variance_count = 0;
2023-08-18 12:59:09 +02:00
function getColumnsFromRow(row: string) {
let headings: any = []
console.log('elem', row)
2023-08-31 22:07:57 +02:00
2023-08-18 12:59:09 +02:00
headings = row.split(separator)
return headings
}
2023-08-18 13:25:58 +02:00
/**
* sort unique values ascending in each column
*/
function reorderValuesInDataVariance() {
columns_headings.forEach((heading: string) => {
data_variance[heading] = data_variance[heading].sort((a: any, b: any) => a - b)
})
}
2023-09-01 11:12:54 +02:00
console.log('read file ', inputPath)
fs.readFile(inputPath, function (err: any, fileData: any) {
if (err) {
throw new Error(err)
} else {
2023-08-18 12:59:09 +02:00
parse(fileData, {columns: false, trim: true}, function (err: any, lines: any) {
// Your CSV data is in an array of arrays passed to this callback as rows.
2023-08-18 12:59:09 +02:00
if (err) {
2023-08-11 18:41:32 +02:00
throw new Error(err)
}
console.log('line ', lines_count)
2023-08-31 22:07:57 +02:00
console.log('lines.length', lines.length)
2023-08-18 12:59:09 +02:00
lines.forEach((line: any) => {
2023-08-18 12:59:09 +02:00
line = line[0]
if (lines_count === 0) {
2023-08-31 22:07:57 +02:00
console.log('line', line)
2023-09-01 11:12:54 +02:00
if (line.indexOf(separator) === -1) {
2023-08-31 22:07:57 +02:00
console.log('separator not found: ', separator, 'trying other separator')
2023-09-01 11:12:54 +02:00
if (line.indexOf(separator_fallback) === -1) {
2023-08-31 22:07:57 +02:00
console.log('separator not found: ', separator_fallback)
throw new Error('no separator found in csv')
2023-09-01 11:12:54 +02:00
} else {
2023-08-31 22:07:57 +02:00
console.log('fallback separator found: ', separator_fallback, 'now using it ')
separator = separator_fallback
}
}
2023-08-18 12:59:09 +02:00
columns_headings = getColumnsFromRow(line)
console.log('columns_headings.length', columns_headings.length)
2023-08-18 12:59:09 +02:00
// console.log('columns_headings', columns_headings)
let headers = Object.keys(columns_headings)
columns_headings.forEach((header: string) => {
data_variance[header] = []
})
console.log('data_variance', data_variance)
} else {
// lignes suivantes
2023-08-18 12:59:09 +02:00
let column_index = 0
2023-08-18 12:59:09 +02:00
line.split(separator).forEach((value: string) => {
value = value.trim()
2023-08-18 12:59:09 +02:00
let column_header_current = columns_headings[column_index]
// console.log('column_index', column_index)
// dans chaque colonne, vérifier que la valeur n'est pas déjà présente
// dans les index de variance
// si la valeur est nouvelle, l'ajouter
2023-08-18 12:59:09 +02:00
if (data_variance[column_header_current].indexOf(value) === -1) {
data_variance[column_header_current].push(value)
if (
2023-08-18 12:59:09 +02:00
data_variance[column_header_current].length > longest_variance_count
) {
2023-08-18 12:59:09 +02:00
longest_variance_count = data_variance[column_header_current].length
}
2023-08-18 13:25:58 +02:00
} else {
console.log('value', value, ' déjà présente dans la collection', column_header_current)
}
2023-08-18 13:25:58 +02:00
column_index++
2023-08-18 12:59:09 +02:00
})
2023-08-18 12:59:09 +02:00
}
2023-08-18 12:59:09 +02:00
lines_count++
})
2023-08-18 12:59:09 +02:00
console.log('lines_count', lines_count)
console.log('longest_variance_count', longest_variance_count)
2023-08-18 13:25:58 +02:00
reorderValuesInDataVariance()
2023-08-31 22:07:57 +02:00
utils.writeFile(fileNameOutput, writeCSVVariance(), outputPath)
})
}
console.log('parsing done')
// console.log('data_variance', data_variance)
})
/**
* écrit un csv avec les données de variance du dataset donné
*/
function writeCSVVariance() {
2023-08-18 13:25:58 +02:00
let csv_content = `;variance de ${inputPath};généré le:;${new Date()};lignes du csv original:;${lines_count};fait avec make_variance_from_csv.ts de Tykayn
`
let columns = Object.keys(data_variance);
// add headings
columns_headings.forEach((heading: string) => {
csv_content = csv_content + separator + heading
})
csv_content = csv_content + '\n'
// add max length of variance for each column
let ii = 0
columns.forEach((column: string) => {
// console.log('column', column, data_variance[column].length)
csv_content = csv_content + separator + data_variance[column].length
ii++
})
csv_content = csv_content + '\n\n'
// add content of values
for (let ii = 0; ii < longest_variance_count; ii++) {
csv_content = csv_content + '\n'
columns.forEach((column: any) => {
if (ii < data_variance[column].length) {
let currentValue = data_variance[column][ii]
csv_content = csv_content + separator + currentValue
} else {
csv_content = csv_content + separator
}
})
}
return csv_content;
}