135 lines
4.3 KiB
TypeScript
135 lines
4.3 KiB
TypeScript
/**
 * Take a CSV,
 * examine every column and its values,
 * keep the unique values of each column in memory,
 * write a new CSV that shows only the unique values of each column
 * and counts the number of values.
 */
|
|
import utils from './mappings/utils'
|
|
import {parse} from 'csv'
|
|
|
|
const fs = require('fs')
|
|
const minimist = require('minimist')
|
|
|
|
// Command-line arguments as parsed by minimist; only `source` is read in this file.
const mini_arguments: any = minimist(process.argv.slice(2))

/** Maps a column heading to the list of distinct values collected for that column. */
interface VarianceType {
  [key: string]: Array<string>
}

// Module-level buffer kept from the original script; writeCSVVariance()
// assembles its output in a local variable of the same name.
let csv_content = 'variance de dataset\n';
// Field separator used when writing the variance report.
const separator = ';';
// Distinct values per column heading, filled while the parsed rows are walked.
const data_variance: VarianceType = {};

// const inputPath = './etalab_data/toilettes/sanisettesparis_reworked.csv'
// Path of the CSV to analyse: the `--source` argument when one is given,
// otherwise the sample file. It is computed in a single expression because the
// binding is `const`; assigning to it afterwards is a compile error and a
// runtime TypeError.
const inputPath: string = mini_arguments['source']
  ? String(mini_arguments['source'])
  : './etalab_data/small.csv'

// Headings taken from the first CSV record, in file order.
const columns_headings: Array<string> = [];
// Number of records processed so far; 0 means the heading record is still expected.
let lines_count = 0;
// Size of the largest distinct-value list; gives the number of value rows in the report.
let longest_variance_count = 0;

console.log('open file ', inputPath)
|
|
// Reads the input file, parses it as CSV, collects the distinct values of every
// column into data_variance and finally writes the report to variance.csv.
fs.readFile(inputPath, function (err: any, fileData: any) {
  if (err) {
    // Rethrow the original error so its stack and error code are preserved.
    throw err
  }

  parse(fileData, {columns: false, trim: true}, function (err: any, rows: any) {
    // The CSV data is passed to this callback as an array of arrays (one per record).
    if (err) {
      throw err
    }
    console.log('line ', lines_count)

    console.log('rows', rows)

    // Membership sets mirroring the arrays in data_variance, keyed by heading,
    // so that checking whether a value is new does not rescan the whole array.
    const seen_by_heading = new Map<string, Set<string>>()

    rows.forEach((row: Array<any>) => {
      if (lines_count === 0) {
        // First record: register every heading with an empty list of values.
        row.forEach((value: string) => {
          columns_headings.push(value)
          data_variance[value] = []
          seen_by_heading.set(value, new Set<string>())
        })
        console.log('columns_headings.length', columns_headings.length)
      } else {
        // Following records: add each value not yet seen to its column's list.
        row.forEach((value: string, column_index: number) => {
          const heading = columns_headings[column_index]
          const seen = seen_by_heading.get(heading)
          if (seen === undefined) {
            // The record has more cells than there are headings; nothing to file it under.
            return
          }

          const trimmed = value.trim()
          if (seen.has(trimmed)) {
            return
          }
          seen.add(trimmed)

          const distinct_values = data_variance[heading]
          distinct_values.push(trimmed)
          if (distinct_values.length > longest_variance_count) {
            longest_variance_count = distinct_values.length
          }
        })
      }
      lines_count++
    })

    console.log('longest_variance_count', longest_variance_count)

    utils.writeFile('variance.csv', writeCSVVariance())
    // console.log('data_variance', data_variance)

    // Logged here, once the rows have been processed: the parse callback runs
    // asynchronously, after the enclosing readFile callback has returned.
    console.log('parsing done')
  })
})
|
|
|
|
/**
 * Builds the CSV text describing the variance of the parsed dataset.
 *
 * Every row starts with the separator. The rows are, in order:
 *   1. a title row holding the input path and the current date,
 *   2. the column headings,
 *   3. the number of distinct values collected for each column,
 *   4. two blank lines, then one row per rank holding the distinct value of
 *      that rank for each column (an empty cell once a column is exhausted).
 *
 * Reads the module-level inputPath, separator, columns_headings, data_variance
 * and longest_variance_count.
 *
 * @returns the complete CSV content as a single string
 */
function writeCSVVariance(): string {
  // Quote a cell that contains the separator, a double quote or a line break,
  // doubling inner quotes, so such a value cannot spill into other columns.
  const escape_cell = (cell: string): string =>
    /["\r\n]/.test(cell) || cell.indexOf(separator) >= 0
      ? '"' + cell.replace(/"/g, '""') + '"'
      : cell

  // Render one row: each cell is preceded by the separator.
  const to_row = (cells: Array<string>): string =>
    cells.map((cell: string) => separator + escape_cell(cell)).join('')

  // Every section walks the columns in columns_headings order. Object.keys(data_variance)
  // lists integer-like headings first and holds a duplicated heading only once,
  // which would shift counts and values away from the heading they belong to.
  const distinct_by_column = columns_headings.map(
    (heading: string) => data_variance[heading] || []
  )

  let csv_content = ';variance de ' + inputPath + ';' + new Date() + '\n'

  // add headings
  csv_content += to_row(columns_headings) + '\n'

  // add the number of distinct values of each column
  csv_content +=
    to_row(distinct_by_column.map((values: Array<string>) => String(values.length))) + '\n\n'

  // add content of values, one row per rank
  for (let rank = 0; rank < longest_variance_count; rank++) {
    const cells = distinct_by_column.map((values: Array<string>) =>
      rank < values.length ? values[rank] : ''
    )
    csv_content += '\n' + to_row(cells)
  }

  return csv_content
}
|