study-docker-repro-longevity/ecg.py

501 lines
18 KiB
Python
Executable File

#!/bin/python3
"""
ECG is a program that automates software environment checking
for scientific artifacts.
It is meant to be executed periodically to analyze variations in the
software environment of the artifact through time.
"""
import subprocess
import json
import argparse
import tempfile
import os
import requests
import zipfile
import tarfile
import pathlib
import logging
import datetime
import sys
import string
import traceback
def trim(url):
"""
Trims given URL to make it contain only lowercase letters and numbers,
as well as with a maximum length of 128.
Parameters
----------
url: str
URL to trim.
Returns
-------
str
Trimmed URL.
"""
trimmed = ""
url_lc = url.lower()
i = 0
while i < len(url_lc) and i < 128:
c = url_lc[i]
if c in string.ascii_lowercase or c in [str(x) for x in range(0, 10)]:
trimmed += c
i += 1
return trimmed
def download_file(url, dest):
"""
Downloads the file stored at the given URL and returns its hash
and location.
Parameters
----------
url: str
URL to the file to download.
dest: str
Path to where the file should be stored.
Returns
-------
str
Hash of the downloaded file, or empty string if download failed.
"""
file_hash = "-1"
try:
req = requests.get(url)
if req.status_code != 404:
file = open(dest, "wb")
file.write(req.content)
file.close()
hash_process = subprocess.run(f"sha256sum {file.name} | cut -d ' ' -f 1 | tr -d '\n'", capture_output=True, shell=True)
file_hash = hash_process.stdout.decode("utf-8")
except requests.exceptions.ConnectionError:
# We can just ignore this exception, as we will just return an empty
# hash to indicate the error:
pass
return file_hash
def download_sources(config, arthashlog_path, dl_dir, use_cache, artifact_name):
"""
Downloads the source of the artifact in 'config'.
Parameters
----------
config: dict
Parsed config file.
arthashlog_path: str
Path to the artifact hash log file.
dl_dir: str
Path to the directory where to download the artifact.
use_cache: bool
Indicates whether the cache should be used or not.
artifact_name: str
Name of the artifact, for the artifact hash log.
Returns
-------
temp_dir: str
Path to the directory where the artifact is downloaded to, or empty
string if download failed.
"""
url = config["artifact_url"]
artcache_dir = trim(url)
artifact_dir = os.path.join(dl_dir, artcache_dir)
# Checking if artifact in cache. Not downloading if it is:
if not os.path.exists(artifact_dir) or not use_cache:
logging.info(f"Downloading artifact from {url}")
# In case cache was used before:
if not use_cache:
os.system(f"rm -rf {artifact_dir}")
os.mkdir(artifact_dir)
artifact_file = tempfile.NamedTemporaryFile()
artifact_path = artifact_file.name
artifact_hash = download_file(url, artifact_path)
# If download was successful:
if artifact_hash != "-1":
if config["type"] == "zip":
artifact = zipfile.ZipFile(artifact_path)
elif config["type"] == "tar":
artifact = tarfile.open(artifact_path)
logging.info(f"Extracting artifact at {artifact_dir}")
artifact.extractall(artifact_dir)
# If download failed:
else:
os.rmdir(artifact_dir)
artifact_dir = ""
# Logging the current hash of the artifact:
arthashlog_file = open(arthashlog_path, "a")
now = datetime.datetime.now()
timestamp = str(datetime.datetime.timestamp(now))
# Artifact hash will be an empty string if download failed:
arthashlog_file.write(f"{timestamp},{artifact_hash},{artifact_name}\n")
arthashlog_file.close()
else:
logging.info(f"Cache found for {url}, skipping download")
return artifact_dir
def builderror_identifier(output):
"""
Parses the given 'output' to indentify the error.
Parameters
----------
output: str
Output of Docker.
Returns
-------
found_error: str
The error that has been found in the output, according to the
categories. If there is more than one, only the latest is taken into
account.
"""
# Possible error messages given by 'docker build' and their category.
# The key is the category, the value is a tuple of error messages belonging to
# to this category:
build_errors = {
"package_install_failed":("Unable to locate package", "error: failed to compile"),
"baseimage_unavailable":("manifest unknown: manifest unknown",),
"dockerfile_not_found":("Dockerfile: no such file or directory",)
}
# Last error found is the right one in theory:
found_error = ""
unknown_error = True
for error_cat, error_msgs in build_errors.items():
for error in error_msgs:
if error in output:
unknown_error = False
found_error = error_cat
if unknown_error:
found_error = "unknown_error"
return found_error
def buildresult_saver(result, buildstatus_path, config_path):
"""
Saves the given result in the 'build_status' file.
Parameters
----------
result: str
The result of the build. Either a Docker 'build' error
(see 'builderror_identifier'), another type of error
(for instance 'artifact_unavailable'), or 'success'
if build is successful.
buildstatus_path: str
Path to the build status file.
config_path: str
Path to the config file.
Returns
-------
None
"""
buildstatus_file = open(buildstatus_path, "a")
artifact_name = os.path.basename(config_path).split(".")[0]
# # Writing header in case file didn't exist:
# if not file_exists:
# buildstatus_file.write("yaml_path,timestamp,error")
now = datetime.datetime.now()
timestamp = str(datetime.datetime.timestamp(now))
buildstatus_file.write(f"{artifact_name},{timestamp},{result}\n")
buildstatus_file.close()
def build_image(config, src_dir, image_name, docker_cache = False):
"""
Builds the given Docker image in 'config'.
Parameters
----------
config: dict
Parsed config file.
src_dir: str
Path to the directory where the artifact is stored.
image_name: str
Name of the Docker image.
docker_cache: bool
Enables or disables Docker 'build' cache.
Returns
-------
return_code: bool, build_output: str
Return code and output of Docker 'build'.
"""
cache_arg = " --no-cache"
if docker_cache:
cache_arg = ""
logging.info(f"Starting building image {image_name}")
path = os.path.join(src_dir, config["buildfile_dir"])
# Using trimmed artifact URL as name:
build_command = f"docker build{cache_arg} -t {image_name} ."
build_process = subprocess.run(build_command.split(" "), cwd=path, capture_output=True)
build_output = f"stdout:\n{build_process.stdout.decode('utf-8')}\nstderr:\n{build_process.stderr.decode('utf-8')}"
logging.info(f"Output of '{build_command}':")
logging.info(build_output)
return_code = build_process.returncode
logging.info(f"Command '{build_command}' exited with code {return_code}")
return return_code, build_output
def check_env(config, src_dir, artifact_name, pkglist_path):
"""
Builds a list of all software packages installed in the
Docker image given in 'config', depending on the package managers
specified in the configuration, then stores it in a CSV file.
Parameters
----------
config: dict
Parsed config file.
src_dir: str
Path to the directory where the artifact is stored.
artifact_name: str
Name of the artifact. Used both as the Docker image name, and for the
packages list for tracking purpose during the output analysis.
pkglist_path: str
Path to the package list file.
Returns
-------
None
"""
# Saving the current time to add it to every row:
now = datetime.datetime.now()
timestamp = str(datetime.datetime.timestamp(now))
# Commands to list installed packages along with their versions and the name
# of the package manager, depending on the package managers.
# Each package manager is associated with a tuple, the first item being
# the package manager's command, the second being the arguments for the
# query (they must be separated for the "--entrypoint" argument of Docker
# 'run', see below), and the third one being the command that will format
# the output of the query command (this one can be an empty string in case
# the formatting part is already done using the options of the first command).
# The first command needs to be run on the container, and the second on the
# host, to take into account container images that do not have the formatting
# packages installed.
pkgmgr_cmd = {
"dpkg": ("dpkg", "-l", "awk 'NR>5 {print $2 \",\" $3 \",dpkg," + artifact_name + "," + timestamp + "\"}'"), \
"rpm":("rpm", "-qa --queryformat '%{NAME},%{VERSION},rpm," + artifact_name + "," + timestamp + "\\n'", ""), \
"pacman":("pacman", "-Q", "awk '{print $0 \",\" $1 \",pacman," + artifact_name + "," + timestamp + "\"}'"), \
"pip":("pip", "list", "awk 'NR>2 {print $1 \",\" $2 \",\" \"pip," + artifact_name + "," + timestamp + "\"}'"), \
"conda":("/root/.conda/bin/conda", "list -e", "sed 's/=/ /g' | awk 'NR>3 {print $1 \",\" $2 \",conda," + artifact_name + "," + timestamp + "\"}'")
}
# Command to obtain the latest commit hash in a git repository (separated
# into 2 parts for "--entrypoint"):
gitcmd = ("git", "log -n 1 --pretty=format:%H")
logging.info("Checking software environment")
pkglist_file = open(pkglist_path, "w")
# pkglist_file.write("package,version,package_manager\n")
path = os.path.join(src_dir, config["buildfile_dir"])
# Package managers:
for pkgmgr in config["package_managers"]:
# "--entrypoint" requires command and arguments to be separated.
# This Docker 'run' option is used to prevent the shell from printing
# a login message, if any.
pkglist_cmd = pkgmgr_cmd[pkgmgr][0]
pkglist_cmdargs = pkgmgr_cmd[pkgmgr][1].split(" ")
listformat_cmd = pkgmgr_cmd[pkgmgr][2]
logging.info(f"Checking '{pkgmgr}'")
# pkglist_process = subprocess.run(["docker", "run", "--rm", config["image_name"]] + pkglist_cmd.split(" "), cwd=path, capture_output=True)
pkglist_process = subprocess.run(["docker", "run", "--rm", "--entrypoint", pkglist_cmd, artifact_name] + pkglist_cmdargs, cwd=path, capture_output=True)
format_process = subprocess.run(f"cat << EOF | {listformat_cmd}\n{pkglist_process.stdout.decode('utf-8')}EOF", cwd=path, capture_output=True, shell=True)
pkglist = format_process.stdout.decode("utf-8")
pkglist_file.write(pkglist)
# Python venvs:
logging.info("Checking Python venvs")
for venv in config["python_venvs"]:
pipcmd = pkgmgr_cmd["pip"][0]
pipcmd_args = pkgmgr_cmd["pip"][1]
pkglist_process = subprocess.run(["docker", "run", "--rm", "-w", venv["path"], "--entrypoint", venv["path"] + "/bin/" + pipcmd, artifact_name] + pipcmd_args.split(" "), cwd=path, capture_output=True)
format_process = subprocess.run(f"cat << EOF | {listformat_cmd}\n{pkglist_process.stdout.decode('utf-8')}EOF", cwd=path, capture_output=True, shell=True)
pkglist = format_process.stdout.decode("utf-8")
pkglist_file.write(pkglist)
# Git packages:
logging.info("Checking Git packages")
for repo in config["git_packages"]:
pkglist_process = subprocess.run(["docker", "run", "--rm", "-w", repo["location"], "--entrypoint", gitcmd[0], artifact_name] + gitcmd[1].split(" "), cwd=path, capture_output=True)
repo_row = f"{repo['name']},{pkglist_process.stdout.decode('utf-8')},git,{artifact_name},{timestamp}"
pkglist_file.write(f"{repo_row}\n")
# Misc packages:
logging.info("Checking miscellaneous packages")
for pkg in config["misc_packages"]:
logging.info(f"Downloading package {pkg['name']} from {pkg['url']}")
pkg_file = tempfile.NamedTemporaryFile()
pkg_path = pkg_file.name
pkg_hash = download_file(pkg["url"], pkg_path)
# Package hash will be an empty string if download failed:
pkg_row = f"{pkg['name']},{pkg_hash},misc,{artifact_name},{timestamp}"
pkglist_file.write(f"{pkg_row}\n")
pkglist_file.close()
def remove_image(config, image_name):
"""
Removes the Docker image given in 'config'.
Parameters
----------
config: dict
Parsed config file.
image_name: str
Name of the Docker image.
Returns
-------
None
"""
logging.info(f"Removing image '{image_name}'")
subprocess.run(["docker", "rmi", image_name], capture_output = True)
def main():
# Paths:
config_path = ""
pkglist_path = "" # Package list being generated
buildstatus_path = "" # Status of the build process of the image, when it fails
arthashlog_path = "" # Log of the hash of the downloaded artifact
cache_dir = "" # Artifact cache directory, when using one. 'None' value indicates no cache.
use_cache = False
# Command line arguments parsing:
parser = argparse.ArgumentParser(
prog = "ecg",
description =
"""
ECG is a program that automates software environment checking for scientific artifacts.
It is meant to be executed periodically to analyze variations in the software environment of the artifact through time.
"""
)
# parser.add_argument(
# '-v', '--verbose',
# action = 'store_true',
# help = "Shows more details on what is being done."
# )
parser.add_argument(
"config",
help = "The path to the configuration file of the artifact's Docker image."
)
parser.add_argument(
"-p", "--pkg-list",
help = "Path to the file where the package list generated by the program should be written.",
required = True
)
# parser.add_argument(
# "-l", "--log-path",
# help = "Path to the file where to log the output of the program.",
# required = True
# )
parser.add_argument(
"-b", "--build-status",
help = "Path to the file where to write the build status of the Docker image given in the configuration file.",
required = True
)
parser.add_argument(
"-a", "--artifact-hash",
help = "Path to the file where to write the log of the hash of the downloaded artifact.",
required = True
)
parser.add_argument(
"-c", "--cache-dir",
help =
"""
Path to the cache directory, where artifacts that are downloaded will be stored for future usage.
If not specified, cache is disabled.
""",
required = False
),
parser.add_argument(
'--docker-cache',
action = 'store_true',
help = "Use cache for Docker 'build'."
)
args = parser.parse_args()
# Setting up the paths of the outputs:
pkglist_path = args.pkg_list
buildstatus_path = args.build_status
arthashlog_path = args.artifact_hash
cache_dir = args.cache_dir
# log_path = "log.txt" # Output of the program
# log_path = args.log_path
# Creating the output files to avoid complaints from Snakemake about missing
# outputs...
pathlib.Path(pkglist_path).touch()
pathlib.Path(buildstatus_path).touch()
pathlib.Path(arthashlog_path).touch()
# Setting up the log:
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# # Old version where the script writes its own log to the given file:
# print(f"Output will be stored in {log_path}")
# logging.basicConfig(filename = log_path, filemode = "w", format = '%(levelname)s: %(message)s', level = logging.INFO)
# if args.verbose:
# logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
# Parsing the input file including the configuration of the artifact's
# image:
config_path = args.config
status = ""
config_file = open(config_path, "r")
config = json.loads(config_file.read())
config_file.close()
dl_dir = None
# If not using cache, creates a temporary directory:
if cache_dir == None:
tmp_dir = tempfile.TemporaryDirectory()
dl_dir = tmp_dir.name
else:
use_cache = True
dl_dir = cache_dir
artifact_name = os.path.splitext(os.path.basename(config_path))[0]
artifact_dir = download_sources(config, arthashlog_path, dl_dir, use_cache, artifact_name)
# If download was successful:
if artifact_dir != "":
return_code, build_output = build_image(config, artifact_dir, artifact_name, args.docker_cache)
if return_code == 0:
status = "success"
check_env(config, artifact_dir, artifact_name, pkglist_path)
remove_image(config, artifact_name)
else:
status = builderror_identifier(build_output)
# Creates file if not already:
pathlib.Path(pkglist_path).touch()
# If download failed, we need to save the error to the build status log:
else:
logging.fatal("Artifact could not be downloaded!")
status = "artifact_unavailable"
# except Exception as err:
# # Handles any possible script's own crashes:
# formatted_err = str(''.join(traceback.format_exception(None, err, err.__traceback__)))
# log_file = open(log_path, "a")
# log_file.write(formatted_err)
# log_file.close()
# logging.error(formatted_err)
# status = "script_crash"
buildresult_saver(status, buildstatus_path, config_path)
if __name__ == "__main__":
main()