study-docker-repro-longevity/analysis/artifact_analysis.py

167 lines
4.8 KiB
Python
Executable File

#!/bin/python3
"""
This script performs an artifact analysis on the outputs of the workflow
to generate tables that can then be plotted by another program.
"""
import argparse
import csv
import os
import datetime
def artifact_changed(table, name):
    """
    Indicates whether the artifact of the given name has changed over time.
    An artifact becoming unavailable is considered as modified.

    Parameters
    ----------
    table: list
        Artifact hash log table. Each row must be indexable, with the hash
        in the 2nd column and the artifact name in the 3rd column.
    name: str
        Name of the artifact to check.

    Returns
    -------
    bool
        True if at least two log entries of this artifact carry different
        hash values, False otherwise (including when the artifact does not
        appear in the table at all).
    """
    # A dedicated sentinel (instead of "") marks "no hash recorded yet", so
    # that an empty hash value found in the log still counts as a real first
    # value to compare the following entries against.
    not_seen = object()
    first_hash = not_seen
    for row in table:
        if row[2] != name:
            continue
        if first_hash is not_seen:
            first_hash = row[1]  # Hash is in the 2nd column
        elif row[1] != first_hash:
            # One differing entry is enough, no need to scan the rest.
            return True
    return False
def artifact_available(table, name):
    """
    Tells whether the artifact of the given name can currently be obtained.

    Parameters
    ----------
    table: list
        Artifact hash log table, with the hash in the 2nd column and the
        artifact name in the 3rd column.
    name: str
        Name of the artifact to check.

    Returns
    -------
    bool
        False if the last log entry of the artifact is marked as a failed
        download, True otherwise (also when the artifact is never logged).
    """
    still_there = True
    for entry in table:
        if entry[2] != name:
            continue
        # The hash column holds "-1" when the artifact could not be
        # downloaded, and the actual hash otherwise. Every matching entry
        # overrides the previous one, so the last entry wins.
        still_there = entry[1] != "-1"
    return still_there
def analysis(input_table):
    """
    Counts, over all the distinct artifacts of the given hash log table, how
    many are still available, how many are not available anymore, and how
    many have changed over time.

    Parameters
    ----------
    input_table: list
        Artifact hash log table to analyse, with the artifact name in the
        3rd column of every row.

    Returns
    -------
    dict
        Counters stored under the keys "available", "unavailable" and
        "changed" (in that order).
    """
    counters = {"available":0, "unavailable":0, "changed":0}
    seen_names = []  # Artifact names that have already been counted
    for entry in input_table:
        current = entry[2]  # 3rd column: name of the artifact
        if current in seen_names:
            continue
        seen_names.append(current)
        # Each artifact is either available or unavailable...
        if artifact_available(input_table, current):
            status = "available"
        else:
            status = "unavailable"
        counters[status] += 1
        # ...and, independently of that, may also be counted as changed.
        if artifact_changed(input_table, current):
            counters["changed"] += 1
    return counters
def main():
    """
    Command line entry point of the artifact analysis.

    Reads every CSV file given with -i/--input, concatenates their rows into
    a single table, runs the analysis on it, adds a "timestamp" column with
    the current time, and writes the result (one header line and one data
    line) to the CSV file given with -o/--output.

    The -v/--verbose flag is accepted but is not read by this function.
    """
    # Command line arguments parsing:
    parser = argparse.ArgumentParser(
        prog="artifact_analysis",
        description=
        """
        This script performs an artifact analysis on the outputs of the workflow
        to generate tables that can then be plotted by another program.
        The generated table gives the amount of artifacts that are available
        or not available, and the amount of artifacts that have been modified
        over time.
        """
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Shows more details on what is being done."
    )
    parser.add_argument(
        "-i", "--input",
        action="append",
        help=
        """
        The CSV file used as input for the analysis function. Multiple files
        can be specified by repeating this argument with different paths.
        All the input files must be artifact hash logs generated by ECG.
        """,
        required=True
    )
    parser.add_argument(
        "-o", "--output",
        help=
        """
        Path to the output CSV file that will be created by the analysis function.
        """,
        required=True
    )
    args = parser.parse_args()
    input_paths = args.input
    output_path = args.output

    # Parsing the input files:
    input_table = []
    for path in input_paths:
        # newline="" lets the csv module handle line endings itself, and the
        # context manager closes the file even if parsing fails.
        with open(path, newline="") as input_file:
            # Blank lines are read as empty rows by csv.reader; they carry
            # no artifact and would break the column lookups of the analysis.
            input_table.extend(row for row in csv.reader(input_file) if row)

    # Analyzing the inputs:
    output_dict = analysis(input_table)

    # Adding the current time to every row:
    output_dict["timestamp"] = str(datetime.datetime.now().timestamp())

    # Writing analysis to output file (truncated if it already exists):
    with open(output_path, "w", newline="") as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=output_dict.keys())
        dict_writer.writeheader()
        dict_writer.writerow(output_dict)
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()