diff --git a/README.md b/README.md
index fb7cc38..a55ecdf 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,43 @@
 # Study of the Reproducibility and Longevity of Dockerfiles
 
-TODO: doc
+ECG is a program that automates software environment checking for scientific artifacts.
+
+It is meant to be executed periodically to analyze variations in the software environment of an artifact over time.
+
+## How it works
+
+ECG takes as input a JSON configuration describing where to download the artifact, where to find the Dockerfile to build inside the artifact, and which package managers are used by the Docker container.
+
+It then downloads the artifact, builds the Dockerfile, and creates a list of the packages installed in the Docker container. It also stores any errors encountered while building the Dockerfile, and logs the hash of the artifact for future comparison.
+
+## Setup
+
+A Linux operating system and the following packages are required:
+- `snakemake`
+- `gawk`
+- `nickel`
+
+The following Python package is also required:
+- `requests`
+
+Alternatively, you can use the Nix package manager and run `nix develop` in this directory to set up the full software environment.
+
+## Usage
+
+Run `ecg.py` as follows:
+
+```
+python3 ecg.py <config_file> -p <pkglist_file> -l <log_file> -b <build_status_file> -a <artifact_hash_log> -c <cache_dir>
+```
+
+Where:
+- `<config_file>` is the configuration file of the artifact in JSON format. An example is given in `artifacts_json/test.json`.
+- `<pkglist_file>` is the path to the file where the package list generated by the program should be written.
+- `<log_file>` is the path to the file where the output of the program should be logged.
+- `<build_status_file>` is the path to the file where the build summary of the Docker image given in the configuration file should be written.
+- `<artifact_hash_log>` is the path to the file where the hash of the downloaded artifact should be logged.
+- `<cache_dir>` is the path to the cache directory, where downloaded artifacts will be stored for future use.
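+
+For instance, using the provided example configuration and some purely illustrative output paths, a run could look like:
+
+```
+# All output paths below are only examples:
+python3 ecg.py artifacts_json/test.json -p pkglist.csv -l ecg.log -b build_status.csv -a artifact_hash.csv -c cache/
+```
+
+Conceptually, the configuration file tells ECG where to download the artifact, where the Dockerfile is located inside it, and which package managers the container uses. The sketch below only illustrates this structure; the field names are hypothetical, and the authoritative format is the one shown in `artifacts_json/test.json`:
+
+```
+{
+    "artifact_url": "<URL of the artifact archive to download>",
+    "dockerfile_location": "<path to the Dockerfile inside the artifact>",
+    "package_managers": ["<package managers used by the Docker container>"]
+}
+```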
+
+## License
+
+TBD
\ No newline at end of file
diff --git a/artifacts/ppopp23/gpu_to_cpu_transpilation_and_optimization_via_high_level_parallel_constructs.yaml b/artifacts_yaml/ppopp23/gpu_to_cpu_transpilation_and_optimization_via_high_level_parallel_constructs.yaml
similarity index 100%
rename from artifacts/ppopp23/gpu_to_cpu_transpilation_and_optimization_via_high_level_parallel_constructs.yaml
rename to artifacts_yaml/ppopp23/gpu_to_cpu_transpilation_and_optimization_via_high_level_parallel_constructs.yaml
diff --git a/artifacts/ppopp23/high_performance_and_scalable_agent_based_simulation_with_BioDynaMo.yaml b/artifacts_yaml/ppopp23/high_performance_and_scalable_agent_based_simulation_with_BioDynaMo.yaml
similarity index 100%
rename from artifacts/ppopp23/high_performance_and_scalable_agent_based_simulation_with_BioDynaMo.yaml
rename to artifacts_yaml/ppopp23/high_performance_and_scalable_agent_based_simulation_with_BioDynaMo.yaml
diff --git a/artifacts/ppopp23/tgopt_redundancy_aware-optimizations_for_temporal_graph_attention_network.yaml b/artifacts_yaml/ppopp23/tgopt_redundancy_aware-optimizations_for_temporal_graph_attention_network.yaml
similarity index 100%
rename from artifacts/ppopp23/tgopt_redundancy_aware-optimizations_for_temporal_graph_attention_network.yaml
rename to artifacts_yaml/ppopp23/tgopt_redundancy_aware-optimizations_for_temporal_graph_attention_network.yaml
diff --git a/artifacts/sc22/a_taxonomy_of_error_sources_in_hpc_io_machine_learning_models.yaml b/artifacts_yaml/sc22/a_taxonomy_of_error_sources_in_hpc_io_machine_learning_models.yaml
similarity index 100%
rename from artifacts/sc22/a_taxonomy_of_error_sources_in_hpc_io_machine_learning_models.yaml
rename to artifacts_yaml/sc22/a_taxonomy_of_error_sources_in_hpc_io_machine_learning_models.yaml
diff --git a/artifacts/sc22/approximate_copmuting_through_the_lens_of_uncertainty_quantification.yaml b/artifacts_yaml/sc22/approximate_copmuting_through_the_lens_of_uncertainty_quantification.yaml
similarity index 100%
rename from artifacts/sc22/approximate_copmuting_through_the_lens_of_uncertainty_quantification.yaml
rename to artifacts_yaml/sc22/approximate_copmuting_through_the_lens_of_uncertainty_quantification.yaml
diff --git a/artifacts/sc22/deinsum_practially_io_optimal_multi_linear_algebra-cpu.yaml b/artifacts_yaml/sc22/deinsum_practially_io_optimal_multi_linear_algebra-cpu.yaml
similarity index 100%
rename from artifacts/sc22/deinsum_practially_io_optimal_multi_linear_algebra-cpu.yaml
rename to artifacts_yaml/sc22/deinsum_practially_io_optimal_multi_linear_algebra-cpu.yaml
diff --git a/artifacts/sc22/deinsum_practially_io_optimal_multi_linear_algebra-gpu.yaml b/artifacts_yaml/sc22/deinsum_practially_io_optimal_multi_linear_algebra-gpu.yaml
similarity index 100%
rename from artifacts/sc22/deinsum_practially_io_optimal_multi_linear_algebra-gpu.yaml
rename to artifacts_yaml/sc22/deinsum_practially_io_optimal_multi_linear_algebra-gpu.yaml
diff --git a/artifacts/template.yaml b/artifacts_yaml/template.yaml
similarity index 100%
rename from artifacts/template.yaml
rename to artifacts_yaml/template.yaml
diff --git a/artifacts/test.yaml b/artifacts_yaml/test.yaml
similarity index 100%
rename from artifacts/test.yaml
rename to artifacts_yaml/test.yaml
diff --git a/ecg.py b/ecg.py
index 0e488a8..42d9c7c 100755
--- a/ecg.py
+++ b/ecg.py
@@ -27,7 +27,7 @@ import sys
 config_path = ""
 pkglist_path = "" # Package list being generated
 buildstatus_path = "" # Summary of the build process of the image
-arthashhist_path = "" # History of the hash of the downloaded artifact
+arthashlog_path = "" # Log of the hash of the downloaded artifact
 cachedir_path = "" # Artifact cache directory
 
 # Commands to list installed packages along with their versions and the name
@@ -133,12 +133,12 @@ def download_sources(config):
         artifact = tarfile.open(artifact_path)
         logging.info(f"Extracting artifact at {artifact_dir}")
         artifact.extractall(artifact_dir)
-        # Saving the current hash of the artifact for the history:
-        arthashhist_file = open(arthashhist_path, "a")
+        # Logging the current hash of the artifact:
+        arthashlog_file = open(arthashlog_path, "a")
         now = datetime.datetime.now()
         timestamp = str(datetime.datetime.timestamp(now))
-        arthashhist_file.write(f"{timestamp},{artifact_hash}\n")
-        arthashhist_file.close()
+        arthashlog_file.write(f"{timestamp},{artifact_hash}\n")
+        arthashlog_file.close()
     else:
         logging.info(f"Cache found for {url}, skipping download")
     return artifact_dir
@@ -265,7 +265,7 @@ def remove_image(config):
     subprocess.run(["docker", "rmi", name], capture_output = True)
 
 def main():
-    global config_path, pkglist_path, buildstatus_path, arthashhist_path, cachedir_path
+    global config_path, pkglist_path, buildstatus_path, arthashlog_path, cachedir_path
 
     # Command line arguments parsing:
     parser = argparse.ArgumentParser(
@@ -298,7 +298,7 @@
     )
     parser.add_argument(
         "-a", "--artifact-hash",
-        help = "Path to the file where to write the history of the hash of the downloaded artifact.",
+        help = "Path to the file where to write the log of the hash of the downloaded artifact.",
         required = True
     )
     parser.add_argument(
@@ -313,7 +313,7 @@
     pkglist_path = args.pkg_list
     log_path = args.log_path
     buildstatus_path = args.build_summary
-    arthashhist_path = args.artifact_hash
+    arthashlog_path = args.artifact_hash
     cachedir_path = args.cache_dir
 
     # Setting up the log: will be displayed both on stdout and to the specified