study-docker-repro-longevity/analysis/softenv_analysis.py

213 lines
6.8 KiB
Python
Raw Normal View History

#!/bin/python3
"""
This script performs a software environment analysis on the outputs
of the workflow to generate tables that can then be plotted by another
program.
"""
import argparse
import csv
import os
import datetime
# Every known package source, each starting with a count of 0.
# Declaring them up front fixes the order of the columns in the result
# table, so the columns can be identified without a header in the CSV file.
pkgsources = dict.fromkeys(
    ("dpkg", "rpm", "pacman", "pip", "conda", "git", "misc"),
    0,
)
def sources_stats(input_table, pkgsources):
    """
    Counts the rows of the given package lists table per package source
    (a package manager, Git or misc).

    Parameters
    ----------
    input_table: list
        Table to analyse, as a list of rows. The package source is read
        from the third column of every row.
    pkgsources: dict
        A dictionary that contains all the possible package sources as keys,
        with all keys' value initialized at 0. It is only used as the
        starting point of the count and is left unmodified.

    Returns
    -------
    dict
        Output table of the analysis in the form of a dict with headers as
        keys. The keys of `pkgsources` come first, in their original order;
        sources found in the table but missing from `pkgsources` are
        appended in order of first appearance.
    """
    # Work on a copy: the caller usually passes a shared, module-level
    # dictionary, and counting in place would leak the result of one
    # analysis into the next one.
    stats = dict(pkgsources)
    for row in input_table:
        source = row[2]  # Third column is the package source
        stats[source] = stats.get(source, 0) + 1
    return stats
def pkg_changed(table, artifact_name, pkgname, pkgsource):
    """
    Analyzes the given package lists table to determine if the given package
    changed for the given artifact.

    Parameters
    ----------
    table: list
        Table to analyse, as a list of rows. For every row, the package name
        is read from the 1st column, its version from the 2nd, its source
        from the 3rd and the artifact name from the 4th.
    artifact_name: str
        Name of the artifact for which we want to analyze package changes.
    pkgname: str
        The package we want to track changes.
    pkgsource: str
        Source of the package, in case there is multiple packages with the
        same name but different sources.

    Returns
    -------
    changed: bool
        True if the version number of the package changed over time, False
        otherwise (including when the package is not in the table at all).
    """
    # None (rather than "") marks "no version seen yet", so that a package
    # whose first recorded version is an empty string is still compared
    # against its later versions.
    first_version = None
    for row in table:
        if row[3] == artifact_name and row[0] == pkgname and row[2] == pkgsource:
            if first_version is None:
                first_version = row[1]
            elif row[1] != first_version:
                # One differing version is enough, no need to scan further.
                return True
    return False
def pkgs_changes(input_table, pkgsources):
    """
    Analyzes the given package lists table to determine the number of packages
    that changed for every package source.

    A package is identified by its name and its source within one artifact,
    and is counted once if at least two of its rows have different versions.

    Parameters
    ----------
    input_table: list
        Table to analyse, as a list of rows. For every row, the package name
        is read from the 1st column, its version from the 2nd, its source
        from the 3rd and the artifact name from the 4th.
    pkgsources: dict
        A dictionary that contains all the possible package sources as keys,
        with all keys' value initialized at 0. It is only used as the
        starting point of the count and is left unmodified.

    Returns
    -------
    dict
        Output table of the analysis in the form of a dict with headers as
        keys. The keys of `pkgsources` come first, in their original order;
        a source missing from `pkgsources` is appended the first time one of
        its packages is found to have changed.
    """
    # First version seen for every (artifact, package, source), in order of
    # first appearance in the table. A single pass replaces rescanning the
    # whole table for every row.
    first_versions = {}
    changed = set()
    for row in input_table:
        key = (row[3], row[0], row[2])
        version = row[1]
        if first_versions.setdefault(key, version) != version:
            changed.add(key)

    # Work on a copy so the caller's (usually module-level) dictionary is
    # not altered by the analysis.
    changes = dict(pkgsources)
    for key in first_versions:
        if key in changed:
            source = key[2]
            # .get() tolerates sources that were not declared in pkgsources.
            changes[source] = changes.get(source, 0) + 1
    return changes
def pkgs_per_container(input_table):
    """
    Placeholder for the analysis of the number of packages per container.

    Parameters
    ----------
    input_table: list
        Table to analyse. Currently ignored.

    Returns
    -------
    None
        The analysis is not implemented yet: an error message is printed
        and nothing is returned.
    """
    message = "ERROR: Not implemented!"
    print(message)
    return None
def main():
    """
    Entry point of the script: parses the command line, loads every input
    package list, runs the requested analysis and writes its result, with
    the current timestamp appended, as a single row in the output CSV file.

    Raises
    ------
    SystemExit
        With status 1 when the requested analysis did not produce a result
        (in addition to the exits triggered by argparse on invalid usage).
    """
    # Command line arguments parsing:
    parser = argparse.ArgumentParser(
        prog = "softenv_analysis",
        description =
        """
        This script performs a software environment analysis on the outputs
        of the workflow to generate tables that can then be plotted
        by another program.
        """
    )
    parser.add_argument(
        "-v", "--verbose",
        action = "store_true",
        help = "Shows more details on what is being done."
    )
    parser.add_argument(
        "-t", "--analysis-type",
        help =
        """
        Specify the type of software analysis to run. Depending on the
        type of analysis, multiple tables can be generated:
        the number of packages per source (a package manager, git or misc)
        by using `sources-stats`,
        the number of packages that changed over time (0 if only
        one file is given, since it will only include the package list
        of a single execution) by using `pkgs-changes`,
        the number of packages per container by specifying `pkgs-per-container`.
        """,
        choices = ["sources-stats", "pkgs-changes", "pkgs-per-container"],
        required = True
    )
    parser.add_argument(
        "-i", "--input",
        action = "append",
        nargs = "+",
        help =
        """
        The CSV file used as input for the analysis function. Multiple files
        can be specified at once by separating them with a space.
        All the input files must be package lists generated by ECG.
        """,
        required = True
    )
    parser.add_argument(
        "-o", "--output",
        help =
        """
        Path to the output CSV file that will be created by the analysis function.
        """,
        required = True
    )
    args = parser.parse_args()
    inputs = args.input
    output_path = args.output
    analysis_type = args.analysis_type

    # Parsing the input files. `--input` combines "append" with nargs="+",
    # so `inputs` is a list of lists of paths (one inner list per `-i`).
    input_table = []
    for path_group in inputs:
        for path in path_group:
            # newline="" lets the csv module handle line endings itself, and
            # `with` closes the file even if reading it fails.
            with open(path, newline="") as input_file:
                input_table.extend(csv.reader(input_file))

    # Analyzing the inputs:
    if analysis_type == "sources-stats":
        output_dict = sources_stats(input_table, pkgsources)
    elif analysis_type == "pkgs-changes":
        output_dict = pkgs_changes(input_table, pkgsources)
    elif analysis_type == "pkgs-per-container":
        output_dict = pkgs_per_container(input_table)

    if output_dict is None:
        # The analysis produced nothing to write (pkgs_per_container() only
        # prints an error for now): stop with a failure status instead of
        # crashing while adding the timestamp below.
        raise SystemExit(1)

    # Adding the current time to the row. This is done on a copy so that the
    # extra column does not end up in the module-level `pkgsources`.
    output_dict = dict(output_dict)
    output_dict["timestamp"] = str(datetime.datetime.now().timestamp())

    # Writing analysis to output file:
    with open(output_path, "w", newline="") as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=output_dict.keys())
        # No header row is written: the order of the columns is fixed by
        # `pkgsources`, so they can be identified without one.
        dict_writer.writerow(output_dict)
# Only run the analysis when this file is executed as a script, so that
# importing it as a module has no side effect.
if __name__ == "__main__":
    main()