study-docker-repro-longevity/analysis/artifact_analysis.py

#!/bin/python3

"""
    This script performs an artifact analysis on the outputs of the workflow
    to generate tables that can then be plotted by another program.
"""

import argparse
import csv
import os
import datetime

def artifact_changed(table, name):
    """
    Indicates whether the artifact of the given name has changed over time.
    An artifact becoming unavailable is considered as modified, because its
    logged hash then differs from the first hash that was recorded.

    Parameters
    ----------
    table: list
        Artifact hash log table. Each row is indexable, with the hash in the
        2nd column and the artifact name in the 3rd column.

    name: str
        Name of the artifact to check.

    Returns
    -------
    bool
        True if artifact changed, False otherwise (including when the
        artifact does not appear in the table at all).
    """
    # None (rather than "") marks "no hash seen yet", so that an empty hash
    # logged first is still remembered and compared against later entries.
    first_hash = None
    for row in table:
        if row[2] != name:
            continue
        if first_hash is None:
            first_hash = row[1] # Hash is in the 2nd column
        elif row[1] != first_hash:
            # One differing entry is enough, no need to scan the rest.
            return True
    return False

def artifact_available(table, name):
    """
    Tells if the artifact of the given name can still be downloaded,
    according to its most recent entry in the hash log.

    Parameters
    ----------
    table: list
        Artifact hash log table.

    name: str
        Name of the artifact to check.

    Returns
    -------
    bool
        False if the last logged hash of the artifact is "-1", True otherwise
        (also True when the artifact never appears in the table).
    """
    # Hash column of the latest log entry found for this artifact, if any.
    latest_hash = None
    for entry in table:
        if entry[2] == name:
            latest_hash = entry[1]
    # "-1" is logged in place of the hash when the artifact could not be
    # downloaded. Only the last entry matters for the current availability.
    return latest_hash != "-1"

def analysis(input_table):
    """
    Analyzes the given artifact hash table to determine if the artifacts are
    still available and didn't change, changed, or aren't available anymore.

    Every distinct artifact name is counted once as either available or
    unavailable, and additionally as changed if its hash varied over time.

    Parameters
    ----------
    input_table: list
        Table to analyse, as a list of rows with the artifact hash in the
        2nd column and the artifact name in the 3rd column.

    Returns
    -------
    dict
        Output table of the analysis in the form of a dict with headers as
        keys ("available", "unavailable", "changed") and counts as values.
    """
    artifacts = {"available":0, "unavailable":0, "changed":0}
    # Artifacts that have been checked already. A set keeps the membership
    # test constant-time however many distinct artifacts are logged.
    checked = set()
    for row in input_table:
        artifact_name = row[2] # Name of the artifact in the 3rd column
        if artifact_name in checked:
            continue
        checked.add(artifact_name)
        if artifact_available(input_table, artifact_name):
            artifacts["available"] += 1
        else:
            artifacts["unavailable"] += 1
        if artifact_changed(input_table, artifact_name):
            artifacts["changed"] += 1
    return artifacts

def main():
    """
    Command line entry point: reads the artifact hash logs given with
    ``--input``, runs the analysis on their concatenation, and writes the
    resulting counts together with the current timestamp as a single-row CSV
    file at the path given with ``--output``.
    """
    # Command line arguments parsing:
    parser = argparse.ArgumentParser(
        prog = "artifact_analysis",
        description =
        """
        This script performs an artifact analysis on the outputs of the workflow
        to generate tables that can then be plotted by another program.
        The generated table gives the amount of artifacts that are available
        or not available, and the amount of artifacts that have been modified
        over time.
        """
    )
    parser.add_argument(
        "-v", "--verbose",
        action = "store_true",
        help = "Shows more details on what is being done."
    )
    parser.add_argument(
        "-i", "--input",
        action = "append",
        help =
        """
        The CSV file used as input for the analysis function. Multiple files
        can be specified by repeating this argument with different paths.
        All the input files must be artifact hash logs generated by ECG.
        """,
        required = True
    )
    parser.add_argument(
        "-o", "--output",
        help =
        """
        Path to the output CSV file that will be created by the analysis function.
        """,
        required = True
    )
    args = parser.parse_args()
    input_paths = args.input
    output_path = args.output

    # Parsing the input files:
    input_table = []
    for path in input_paths:
        # newline="" lets the csv module handle line endings itself, and the
        # context manager closes the file even if parsing fails.
        with open(path, newline="") as input_file:
            # csv.reader yields an empty list for blank lines; those carry no
            # data and would otherwise break the column lookups of the analysis.
            input_table += [row for row in csv.reader(input_file) if row]

    # Analyzing the inputs:
    output_dict = analysis(input_table)
    # Adding the current time (POSIX timestamp) to the single output row:
    now = datetime.datetime.now()
    timestamp = str(now.timestamp())
    output_dict["timestamp"] = timestamp

    # Writing analysis to output file (header line followed by one data row):
    with open(output_path, "w", newline="") as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=output_dict.keys())
        dict_writer.writeheader()
        dict_writer.writerow(output_dict)

# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()