154 lines
5.0 KiB
TypeScript
154 lines
5.0 KiB
TypeScript
/**
 * Take a CSV,
 * examine every column and its values,
 * keep in memory the unique values of each column,
 * write a new CSV that shows only the unique values for each column
 * and counts the number of values.
 */
|
|
import utils from './mappings/utils'
|
|
import {parse} from 'csv'
|
|
|
|
const fs = require('fs')
|
|
const minimist = require('minimist')
|
|
|
|
// Command-line arguments as parsed by minimist; `--source <path>` overrides the input CSV.
let mini_arguments: any = minimist(process.argv.slice(2))

// interface VarianceType {
//   [key: string]: Array<string>
// }

// Not read by writeCSVVariance(), which declares its own local `csv_content`.
let csv_content = 'variance de dataset\n';
// Field separator used to split input lines and to join output cells.
let separator = ';';
// Maps each column heading to the list of distinct values seen in that column.
let data_variance: any = {};

// Must be `let` (it was `const`): it is reassigned just below when `--source` is given.
let inputPath = './etalab_data/toilettes/sanisettesparis_reworked.csv'
// let inputPath = './etalab_data/toilettes/small_datas.csv'
// Passed as the third argument of utils.writeFile when the result is written.
let outputPath = 'etalab_data/toilettes/'
if (mini_arguments['source']) {
  inputPath = mini_arguments['source']
}
// Column headings, filled in from the first row of the input.
let columns_headings: Array<string> = [];
// Number of rows processed so far (the header row included).
let lines_count = 0;
// Size of the largest per-column list of distinct values.
let longest_variance_count = 0;
|
|
|
|
|
|
/**
 * Splits a raw header line into its column headings.
 *
 * The line is cut on the module-level `separator`; the cells are returned
 * as-is (no trimming is applied here).
 *
 * @param row the raw text of the header line
 * @returns the headings, in the order they appear in the line
 */
function getColumnsFromRow(row: string): string[] {
  console.log('elem', row)
  return row.split(separator)
}
|
|
|
|
console.log('open file ', inputPath)

// Read the input file, collect the distinct values of every column into
// `data_variance`, then write the summary produced by writeCSVVariance().
fs.readFile(inputPath, function (err: any, fileData: any) {
  if (err) {
    // Rethrow the original error so its code and stack are preserved.
    throw err
  }

  // NOTE(review): no `delimiter` option is passed to parse(), and only the first
  // field of each record is used below, then split on `separator` by hand.
  // Presumably this relies on the input containing no comma or quote — confirm,
  // or consider passing the separator as the parser's delimiter instead.
  parse(fileData, {columns: false, trim: true}, function (err: any, lines: any) {
    // The CSV data is passed to this callback as an array of arrays (rows).
    if (err) {
      throw err
    }
    console.log('line ', lines_count)

    lines.forEach((line: any) => {
      // The whole separator-delimited line is expected in the first field.
      const raw_line = line[0]

      if (lines_count === 0) {
        // First row: headings. Start an empty list of known values per column.
        columns_headings = getColumnsFromRow(raw_line)
        console.log('columns_headings.length', columns_headings.length)
        columns_headings.forEach((header: string) => {
          data_variance[header] = []
        })
        console.log('data_variance', data_variance)
      } else {
        // Following rows: record each value not yet seen in its column.
        raw_line.split(separator).forEach((cell: string, column_index: number) => {
          const value = cell.trim()
          const column_header_current = columns_headings[column_index]
          const known_values = data_variance[column_header_current]

          if (known_values === undefined) {
            // The row has more cells than the header row: there is no column
            // to attach this value to, so skip it instead of crashing.
            console.log('line', lines_count, 'has no heading for column index', column_index, '- value ignored')
            return
          }

          if (known_values.indexOf(value) === -1) {
            known_values.push(value)
            longest_variance_count = Math.max(longest_variance_count, known_values.length)
          } else {
            console.log('value',value,' déjà présente dans la collection',column_header_current )
          }
        })
      }

      lines_count++
    })

    console.log('lines_count', lines_count)
    console.log('longest_variance_count', longest_variance_count)

    utils.writeFile('variance.csv', writeCSVVariance(), outputPath)

    // Logged here, once the rows have actually been processed; it used to be
    // logged right after parse() was called, before this callback had run.
    console.log('parsing done')
  })
})
|
|
|
|
/**
 * Builds the CSV text describing the variance of the parsed dataset.
 *
 * Despite its name this function does not write anything itself: it returns
 * the text, which the caller hands to utils.writeFile.
 *
 * Layout of the result (every cell is preceded by `separator`):
 *  - a title line with the input path and the current date,
 *  - the column headings,
 *  - the number of distinct values of each column,
 *  - blank lines, then one line per rank of distinct value; columns that have
 *    fewer distinct values get an empty cell.
 *
 * @returns the complete CSV content as a string
 */
function writeCSVVariance() {
  // Every row iterates the headings in input order so that the cells stay
  // aligned with the heading row. Object.keys(data_variance) is not safe for
  // this: it lists integer-like keys first and collapses duplicate headings.
  const columns = columns_headings

  // Joins cells into one row, each cell preceded by the separator.
  const toRow = (cells: Array<string | number>): string =>
    cells.map((cell) => separator + cell).join('')

  let csv_content = ';variance de ' + inputPath + ';' + new Date() + '\n'

  // add headings
  csv_content += toRow(columns) + '\n'

  // add the number of distinct values for each column
  csv_content += toRow(columns.map((column: string) => data_variance[column].length)) + '\n\n'

  // add content of values, one row per rank
  for (let rank = 0; rank < longest_variance_count; rank++) {
    const cells = columns.map((column: string) =>
      rank < data_variance[column].length ? data_variance[column][rank] : ''
    )
    csv_content += '\n' + toRow(cells)
  }

  return csv_content;
}
|