study-docker-repro-longevity/ecg.py

305 lines
10 KiB
Python
Raw Normal View History

2024-07-09 18:25:21 +02:00
#!/bin/python3
"""
ECG is a program that automates software environment checking
for scientific artifacts.
It is meant to be executed periodically to analyze variations in the
software environment of the artifact through time.
"""
2024-07-09 18:25:21 +02:00
import subprocess
import json
import yaml
import argparse
import tempfile
import os
import requests
import zipfile
import io
import tarfile
import pathlib
import logging
import datetime
import sys
# Paths:
# These module-level placeholders are overwritten by main() from the
# command line arguments before any other function reads them.
config_path = "" # YAML configuration file of the artifact
pkglist_path = "" # Package list being generated
buildstatus_path = "" # Summary of the build process of the image
cachedir_path = "" # Artifact cache directory
2024-07-09 18:25:21 +02:00
# Commands to list installed packages along with their versions and the name
# of the package manager, depending on the package managers.
# Each package manager is associated with a tuple, the first item being
# the query command, and the second being the command that will format
# the output of the query command (this one can be an empty string in case
# the formatting part is already done using the options of the first command).
# The first needs to be run on the container, and the second on the host,
# to take into account container images that do not have the formatting
# packages installed.
# Every formatting command is expected to emit "package,version,manager" rows.
pkgmgr_cmd = {
    "dpkg": ("dpkg -l", "awk 'NR>5 {print $2 \",\" $3 \",\" \"dpkg\"}'"),
    "rpm": ("rpm -qa --queryformat '%{NAME},%{VERSION},rpm\\n'", ""),
    # 'pacman -Q' prints "name version": the name is $1 and the version is $2.
    "pacman": ("pacman -Q", "awk '{print $1 \",\" $2 \",pacman\"}'"),
    "pip": ("pip freeze", "sed 's/==/,/g' | awk '{print $0 \",pip\"}'"),
    "conda": ("/root/.conda/bin/conda list -e", "sed 's/=/ /g' | awk 'NR>3 {print $1 \",\" $2 \",conda\"}'"),
}
2024-07-09 18:25:21 +02:00
# Possible error messages given by 'docker build' and their category.
# The key is the category, the value is the error message (a plain string)
# that identifies this category when found in the build output:
build_errors = {
    "package_unavailable": "Unable to locate package",
    "baseimage_unavailable": "manifest unknown: manifest unknown",
}
2024-07-10 14:50:57 +02:00
# Command to obtain the latest commit hash in a git repository.
# '--pretty=format:%H' prints only the full hash of the selected commit:
gitcmd = "git log -n 1 --pretty=format:%H"
def trim(url):
    """
    Builds a cache-friendly name out of the given URL.

    The URL is lowercased, then every slash, colon, semicolon, backslash,
    single or double quote, space, asterisk and question mark is dropped.

    Parameters
    ----------
    url: str
        URL to trim.

    Returns
    -------
    str
        Trimmed URL.
    """
    forbidden = "/:;\\'\" *?"
    return "".join(char for char in url.lower() if char not in forbidden)
2024-07-09 18:25:21 +02:00
def download_sources(config):
    """
    Downloads the source of the artifact in 'config', unless it is already
    present in the cache directory.

    Parameters
    ----------
    config: dict
        Parsed YAML config file. The keys 'artifact_url' and 'type'
        ('zip' or 'tgz') are read.

    Returns
    -------
    str
        Path to the directory (inside the cache directory) where the
        artifact is extracted.

    Raises
    ------
    ValueError
        If the artifact has to be downloaded and 'type' is neither 'zip'
        nor 'tgz'.
    requests.HTTPError
        If the server answers the download request with an error status.
    """
    # Only needed to clean up after a failed extraction:
    import shutil

    url = config["artifact_url"]
    artifact_name = trim(url)
    artifact_dir = f"{cachedir_path}/{artifact_name}"

    # Checking if artifact in cache. Not downloading if it is:
    if os.path.exists(artifact_dir):
        logging.info(f"Cache found for {url}, skipping download")
        return artifact_dir

    artifact_type = config["type"]
    if artifact_type not in ("zip", "tgz"):
        raise ValueError(
            f"Unsupported artifact type '{artifact_type}', expected 'zip' or 'tgz'"
        )

    logging.info(f"Downloading artifact from {url}")
    req = requests.get(url)
    # Failing early avoids trying to unpack an HTML error page:
    req.raise_for_status()
    content = io.BytesIO(req.content)
    if artifact_type == "zip":
        artifact = zipfile.ZipFile(content)
    else:
        artifact = tarfile.open(fileobj=content)

    # The cache directory is only created once the archive has been
    # downloaded and opened, and is removed if the extraction fails, so that
    # a failed run never leaves an empty or partial directory that would be
    # mistaken for a valid cache entry on the next run.
    # NOTE(review): archive members are extracted without path filtering;
    # confirm that artifacts come from trusted sources.
    os.mkdir(artifact_dir)
    try:
        with artifact:
            logging.info(f"Extracting artifact at {artifact_dir}")
            artifact.extractall(artifact_dir)
    except BaseException:
        shutil.rmtree(artifact_dir, ignore_errors=True)
        raise
    return artifact_dir
2024-07-09 18:25:21 +02:00
def buildstatus_saver(output):
    """
    Parses the given 'output' to identify the errors, then appends them to
    the build status file ('buildstatus_path').

    One row "config_path,timestamp,error_category" is written for each
    category of 'build_errors' that has a message found in 'output'.

    Parameters
    ----------
    output: str
        The output of Docker.

    Returns
    -------
    None
    """
    with open(buildstatus_path, "a") as buildstatus_file:
        for error_cat, errors in build_errors.items():
            # A category may be described by a single message or by a
            # collection of messages:
            if isinstance(errors, str):
                errors = (errors,)
            if any(error in output for error in errors):
                now = datetime.datetime.now()
                timestamp = str(datetime.datetime.timestamp(now))
                buildstatus_file.write(f"{config_path},{timestamp},{error_cat}\n")
2024-07-09 18:25:21 +02:00
def build_image(config, src_dir):
    """
    Builds the given Docker image in 'config'.

    The output of the build is logged, then given to 'buildstatus_saver'
    so that known errors are recorded.

    Parameters
    ----------
    config: dict
        Parsed YAML config file. The keys 'image_name' and
        'dockerfile_location' are read.

    src_dir: str
        The directory where the artifact is stored.

    Returns
    -------
    bool
        'True' if build successful, 'False' otherwise.
    """
    name = config["image_name"]
    logging.info(f"Starting building image {name}")
    path = os.path.join(src_dir, config["dockerfile_location"])

    # The command is run from an argument list, so the image name is always
    # passed as a single argument. The string form is only used for logging.
    build_args = ["docker", "build", "-t", name, "."]
    build_command = " ".join(build_args)
    build_process = subprocess.run(build_args, cwd=path, capture_output=True)

    # Undecodable bytes are replaced instead of aborting the whole run:
    build_stdout = build_process.stdout.decode("utf-8", errors="replace")
    build_stderr = build_process.stderr.decode("utf-8", errors="replace")
    build_output = f"stdout:\n{build_stdout}\nstderr:\n{build_stderr}"
    logging.info(f"Output of '{build_command}':")
    logging.info(build_output)

    return_code = build_process.returncode
    logging.info(f"Command '{build_command}' exited with code {return_code}")
    buildstatus_saver(build_output)
    return return_code == 0
def check_env(config, src_dir):
    """
    Builds a list of all software packages installed in the
    Docker image given in 'config', depending on the package managers
    specified in the configuration, then stores it in a CSV file
    ('pkglist_path'), one "package,version,source" row per package.

    Parameters
    ----------
    config: dict
        Parsed YAML config file. The keys 'image_name',
        'dockerfile_location' and 'package_managers' are read, as well as
        the optional keys 'git_packages' and 'misc_packages'.

    src_dir: str
        The directory where the artifact is stored.

    Returns
    -------
    None
    """
    # Only needed by this function:
    import hashlib
    import shlex

    logging.info("Checking software environment")
    path = os.path.join(src_dir, config["dockerfile_location"])
    image_name = config["image_name"]

    with open(pkglist_path, "w") as pkglist_file:
        for pkgmgr in config["package_managers"]:
            pkglist_cmd, listformat_cmd = pkgmgr_cmd[pkgmgr]
            logging.info(f"Checking '{pkgmgr}'")
            # shlex honours the quoting used in the query commands, so that
            # no literal quote character reaches the container:
            pkglist_process = subprocess.run(
                ["docker", "run", "--rm", image_name] + shlex.split(pkglist_cmd),
                cwd=path,
                capture_output=True,
            )
            if listformat_cmd:
                # The raw list is given to the formatting command on its
                # standard input, so the shell never interprets its content:
                format_process = subprocess.run(
                    listformat_cmd,
                    input=pkglist_process.stdout,
                    cwd=path,
                    capture_output=True,
                    shell=True,
                )
                pkglist = format_process.stdout.decode("utf-8")
            else:
                # The query command already produces the final format:
                pkglist = pkglist_process.stdout.decode("utf-8")
            pkglist_file.write(pkglist)

        if "git_packages" in config:
            logging.info("Checking Git packages")
            for repo in config["git_packages"]:
                pkglist_process = subprocess.run(
                    ["docker", "run", "--rm", "-w", repo["location"], image_name]
                    + gitcmd.split(" "),
                    cwd=path,
                    capture_output=True,
                )
                commit_hash = pkglist_process.stdout.decode("utf-8").strip()
                pkglist_file.write(f"{repo['name']},{commit_hash},git\n")

        if "misc_packages" in config:
            logging.info("Checking packages obtained outside of a package manager or VCS")
            for pkg in config["misc_packages"]:
                logging.info(f"Downloading package {pkg['name']} from {pkg['url']}")
                req = requests.get(pkg["url"])
                # The downloaded content is hashed in memory, which gives the
                # SHA-256 of the whole file:
                pkg_hash = hashlib.sha256(req.content).hexdigest()
                pkglist_file.write(f"{pkg['name']},{pkg_hash},misc\n")
2024-07-09 18:25:21 +02:00
def remove_image(config):
    """
    Deletes the Docker image named in 'config' by running 'docker rmi'.

    The output of the command is captured and its exit status is not
    checked.

    Parameters
    ----------
    config: dict
        Parsed YAML config file. The key 'image_name' is read.

    Returns
    -------
    None
    """
    image_name = config["image_name"]
    logging.info(f"Removing image '{image_name}'")
    rmi_command = ["docker", "rmi", image_name]
    subprocess.run(rmi_command, capture_output=True)
2024-07-09 18:25:21 +02:00
def main():
    """
    Parses the command line, sets up the output paths and the log, loads the
    YAML configuration, then downloads the artifact, builds its Docker image
    and, if the build succeeded, lists the installed packages and removes
    the image.

    Returns
    -------
    None
    """
    global config_path, pkglist_path, buildstatus_path, cachedir_path

    # Command line arguments parsing:
    parser = argparse.ArgumentParser(
        prog = "ecg",
        description = "ECG is a program that automates software environment checking for scientific artifacts. "
            "It is meant to be executed periodically to analyze variations in the software environment of the artifact through time."
    )
    parser.add_argument('-v', '--verbose',
        action = 'store_true',
        help = "Shows more details on what is being done."
    )
    parser.add_argument(
        "config",
        help = "The path to the configuration file of the artifact's Docker image."
    )
    parser.add_argument(
        "-p", "--pkg-list",
        help = "Path to the file where the package list generated by the program should be written.",
        required = True
    )
    parser.add_argument(
        "-l", "--log-path",
        help = "Path to the file where to log the output of the program.",
        required = True
    )
    parser.add_argument(
        "-b", "--build-summary",
        help = "Path to the file where to write the build summary of the Docker image given in the configuration file.",
        required = True
    )
    parser.add_argument(
        "-c", "--cache-dir",
        help = "Path to the cache directory, where artifact that are downloaded will be stored for future usage.",
        required = True
    )
    args = parser.parse_args()

    # Setting up the paths of the outputs:
    pkglist_path = args.pkg_list
    log_path = args.log_path # Output of the program
    buildstatus_path = args.build_summary
    cachedir_path = args.cache_dir

    # Setting up the log: it is always written to the specified file, and
    # also displayed on stdout when the verbose option is given:
    print(f"Output will be stored in {log_path}")
    logging.basicConfig(filename = log_path, filemode = "w", format = '%(levelname)s: %(message)s', level = logging.INFO)

    # Parsing the input YAML file including the configuration of
    # the artifact's image:
    config_path = args.config
    with open(config_path, "r") as config_file:
        config = yaml.safe_load(config_file)

    verbose = args.verbose
    if verbose:
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    src_dir = download_sources(config)
    successful_build = build_image(config, src_dir)
    if successful_build:
        check_env(config, src_dir)
        remove_image(config)
2024-07-09 18:25:21 +02:00
# Only run the program when this file is executed as a script, not when it
# is imported as a module:
if __name__ == "__main__":
    main()