2024-09-12 14:07:19 +02:00

274 lines
11 KiB
Python
Executable File

import subprocess
import json
import argparse
import tempfile
import os
import requests
import zipfile
import tarfile
import pathlib
import logging
import datetime
import sys
import string
import traceback
import hashlib
def download_file_and_get_hash(url, dest_path):
    """
    Downloads the file stored at the given URL, writes it to 'dest_path'
    and returns the SHA-256 hash of its content.

    Parameters
    ----------
    url: str
        URL of the file to download.

    dest_path: str
        Path of the file where the downloaded content is written.

    Returns
    -------
    str
        Hexadecimal SHA-256 digest of the downloaded content, or "-1" if the
        download failed (request error or HTTP error status). Nothing is
        written to 'dest_path' in case of failure.
    """
    file_hash = "-1"
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as err:
        # Best effort: a failed download is not fatal, the caller is told
        # about it through the "-1" hash. Log it so it can be diagnosed.
        logging.warning(f"Could not download {url}: {err}")
        return file_hash
    if not req.ok:
        # Any HTTP error status (not only 404) means that the body is an
        # error page and not the requested file, so it must not be hashed.
        logging.warning(f"Could not download {url}: HTTP status {req.status_code}")
        return file_hash
    with open(dest_path, "wb") as file:
        file.write(req.content)
    file_hash = hashlib.sha256(req.content).hexdigest()
    return file_hash
def download_sources(url, archive_type, arthashlog_path, dl_dir, artifact_name):
    """
    Downloads the artifact archive stored at the given URL, extracts it in
    a sub-directory of 'dl_dir' and logs the hash of the archive.

    Parameters
    ----------
    url: str
        URL of the artifact archive.

    archive_type: str
        Type of the archive, either "zip" or "tar".

    arthashlog_path: str
        Path to the file where the timestamp, the hash of the archive and
        the artifact name are written as a CSV row. The row is written even
        if the download failed, the hash being "-1" in that case.

    dl_dir: str
        Path to the directory in which the extraction directory is created.

    artifact_name: str
        Name of the artifact, written in the hash log.

    Returns
    -------
    str
        Path to the directory where the artifact has been extracted, or an
        empty string if the download failed.
    """
    logging.info(f"Downloading artifact from {url}")
    artifact_dir = ""
    extractors = {
        "zip": zipfile.ZipFile,
        "tar": tarfile.open
    }
    # The temporary archive is deleted as soon as the 'with' block is left:
    with tempfile.NamedTemporaryFile() as tmp_artifact_file:
        tmp_artifact_path = tmp_artifact_file.name
        artifact_hash = download_file_and_get_hash(url, tmp_artifact_path)
        if artifact_hash != "-1":
            # The extraction directory is named after the beginning of the
            # hash of the archive:
            artcache_dir = f"ecg_{artifact_hash[:9]}"
            artifact_dir = os.path.join(dl_dir, artcache_dir)
            logging.info(f"Extracting artifact at {artifact_dir}")
            os.mkdir(artifact_dir)
            # NOTE(review): 'extractall' trusts the member paths stored in
            # the downloaded archive. Consider the 'filter' argument of
            # tarfile (when the targeted Python versions support it) if the
            # archives cannot be trusted.
            with extractors[archive_type](tmp_artifact_path) as archive:
                archive.extractall(artifact_dir)
    with open(arthashlog_path, "w") as arthashlog_file:
        now = datetime.datetime.now()
        timestamp = str(datetime.datetime.timestamp(now))
        arthashlog_file.write(f"{timestamp},{artifact_hash},{artifact_name}\n")
    return artifact_dir
def builderror_identifier(output):
    """
    Classifies a failed build from the text of its output.

    Parameters
    ----------
    output: str
        Output of the build command.

    Returns
    -------
    str
        Name of the first error category (in declaration order) for which
        one of the known messages is found in 'output', or "unknown_error"
        if no known message is found.
    """
    known_failures = (
        ("package_install_failed", ("Unable to locate package", "error: failed to compile")),
        ("baseimage_unavailable", ("manifest unknown: manifest unknown",)),
        ("dockerfile_not_found", ("Dockerfile: no such file or directory",)),
    )
    matching_categories = (
        category
        for category, markers in known_failures
        if any(marker in output for marker in markers)
    )
    return next(matching_categories, "unknown_error")
def buildresult_saver(result, buildstatus_path, config_path):
    """
    Writes the result of the build of an artifact in the build status file.

    Parameters
    ----------
    result: str
        Status of the build, written as is in the file.

    buildstatus_path: str
        Path to the build status file. Its previous content is overwritten.

    config_path: str
        Path to the configuration file of the artifact. The artifact name
        is the name of this file without its extension.

    Returns
    -------
    None
    """
    # Only the last extension is removed, the same way 'main' derives the
    # artifact name, so that a dotted name such as "my.artifact.json" is
    # logged under the same name in every output file:
    artifact_name = os.path.splitext(os.path.basename(config_path))[0]
    timestamp = str(datetime.datetime.now().timestamp())
    with open(buildstatus_path, "w") as buildstatus_file:
        buildstatus_file.write(f"{artifact_name},{timestamp},{result}\n")
def build_image(path, dockerfile_path, image_name, build_args):
    """
    Builds a Docker image with 'docker build --no-cache', using 'path' as
    both the working directory and the build context.

    Parameters
    ----------
    path: str
        Path to the directory from which the build is run.

    dockerfile_path: str
        Path to the Dockerfile, given to the '-f' option.

    image_name: str
        Name given to the built image ('-t' option).

    build_args: list
        Build arguments, each one being passed with its own '--build-arg'
        option.

    Returns
    -------
    tuple
        The return code of the build command, and a string containing both
        its standard output and its standard error.
    """
    logging.info(f"Starting building image {image_name}")
    # The command is built as a list of arguments instead of a string that
    # is split afterwards, so that every option stays separated from the
    # previous one and values containing spaces are kept in one piece:
    build_command_args = ["docker", "build", "--no-cache", "-t", image_name, "-f", dockerfile_path]
    for build_arg in build_args or []:
        build_command_args += ["--build-arg", build_arg]
    build_command_args.append(".")
    # Only used for logging purpose:
    build_command = " ".join(build_command_args)
    build_process = subprocess.run(build_command_args, cwd=path, capture_output=True)
    # Build tools may print bytes that are not valid UTF-8, this must not
    # make the program crash once the build is over:
    build_stdout = build_process.stdout.decode("utf-8", errors="replace")
    build_stderr = build_process.stderr.decode("utf-8", errors="replace")
    build_output = f"stdout:\n{build_stdout}\nstderr:\n{build_stderr}"
    logging.info(f"Output of '{build_command}':\n\n{build_output}")
    return_code = build_process.returncode
    logging.info(f"Command '{build_command}' exited with code {return_code}")
    return return_code, build_output
def _format_pkglist(raw_pkglist, listformat_cmd, cwd):
    """
    Formats the raw output of a package list query into CSV rows, by
    feeding it to the standard input of 'listformat_cmd', run on the host.
    The raw output is returned as is if 'listformat_cmd' is empty.
    """
    if listformat_cmd:
        # The raw output is given through the standard input rather than
        # through a shell here-document, so that the shell never interprets
        # its content ('$', backticks...) and so that it does not have to
        # end with a newline.
        format_process = subprocess.run(listformat_cmd, input=raw_pkglist, cwd=cwd, capture_output=True, shell=True)
        raw_pkglist = format_process.stdout
    return raw_pkglist.decode("utf-8", errors="replace")


def check_env(config, src_dir, artifact_name, pkglist_path):
    """
    Builds a list of all software packages installed in the
    Docker image given in 'config', depending on the package managers
    specified in the configuration, then stores it in a CSV file.

    Parameters
    ----------
    config: dict
        Parsed config file.

    src_dir: str
        Path to the directory where the artifact is stored.

    artifact_name: str
        Name of the artifact. Used both as the Docker image name, and for the
        packages list for tracking purpose during the output analysis.

    pkglist_path: str
        Path to the package list file.

    Returns
    -------
    None
    """
    # Saving the current time to add it to every row:
    now = datetime.datetime.now()
    timestamp = str(datetime.datetime.timestamp(now))
    # Commands to list installed packages along with their versions and the name
    # of the package manager, depending on the package managers.
    # Each package manager is associated with a tuple, the first item being
    # the package manager's command, the second being the list of arguments
    # for the query (they must be separated from the command for the
    # "--entrypoint" argument of Docker 'run', see below), and the third one
    # being the command that will format the output of the query command
    # (this one can be an empty string in case the formatting part is already
    # done using the options of the first command).
    # The first command needs to be run on the container, and the second on the
    # host, to take into account container images that do not have the formatting
    # packages installed.
    pkgmgr_cmd = {
        "dpkg": (
            "dpkg",
            ["-l"],
            f"awk 'NR>5 {{print $2 \",\" $3 \",dpkg,{artifact_name},{timestamp}\"}}'",
        ),
        # The query format is not surrounded by quotes: no shell is involved
        # when running the query, so they would be printed on every row.
        "rpm": (
            "rpm",
            ["-qa", "--queryformat", f"%{{NAME}},%{{VERSION}},rpm,{artifact_name},{timestamp}\\n"],
            "",
        ),
        # 'pacman -Q' prints "name version" rows:
        "pacman": (
            "pacman",
            ["-Q"],
            f"awk '{{print $1 \",\" $2 \",pacman,{artifact_name},{timestamp}\"}}'",
        ),
        "pip": (
            "pip",
            ["list"],
            f"awk 'NR>2 {{print $1 \",\" $2 \",\" \"pip,{artifact_name},{timestamp}\"}}'",
        ),
        "conda": (
            "/root/.conda/bin/conda",
            ["list", "-e"],
            f"sed 's/=/ /g' | awk 'NR>3 {{print $1 \",\" $2 \",conda,{artifact_name},{timestamp}\"}}'",
        ),
    }
    # Command to obtain the latest commit hash in a git repository (separated
    # into 2 parts for "--entrypoint"):
    gitcmd = ("git", ["-c", "safe.directory=*", "log", "-n", "1", "--pretty=format:%H"])
    logging.info("Checking software environment")
    path = os.path.join(src_dir, config["buildfile_dir"])
    with open(pkglist_path, "w") as pkglist_file:
        # Package managers:
        for pkgmgr in config["package_managers"]:
            # "--entrypoint" requires command and arguments to be separated.
            # This Docker 'run' option is used to prevent the shell from printing
            # a login message, if any.
            pkglist_cmd, pkglist_cmdargs, listformat_cmd = pkgmgr_cmd[pkgmgr]
            logging.info(f"Checking '{pkgmgr}'")
            pkglist_process = subprocess.run(
                ["docker", "run", "--rm", "--entrypoint", pkglist_cmd, artifact_name] + pkglist_cmdargs,
                cwd=path,
                capture_output=True,
            )
            pkglist_file.write(_format_pkglist(pkglist_process.stdout, listformat_cmd, path))
        # Python venvs:
        logging.info("Checking Python venvs")
        # The pip formatter is always used here, whatever the package
        # managers checked above were:
        pipcmd, pipcmd_args, pip_listformat_cmd = pkgmgr_cmd["pip"]
        for venv in config["python_venvs"]:
            pkglist_process = subprocess.run(
                ["docker", "run", "--rm", "-w", venv["path"], "--entrypoint", venv["path"] + "/bin/" + pipcmd, artifact_name] + pipcmd_args,
                cwd=path,
                capture_output=True,
            )
            pkglist_file.write(_format_pkglist(pkglist_process.stdout, pip_listformat_cmd, path))
        # Git packages:
        logging.info("Checking Git packages")
        for repo in config["git_packages"]:
            pkglist_process = subprocess.run(
                ["docker", "run", "--rm", "-w", repo["location"], "--entrypoint", gitcmd[0], artifact_name] + gitcmd[1],
                cwd=path,
                capture_output=True,
            )
            repo_row = f"{repo['name']},{pkglist_process.stdout.decode('utf-8')},git,{artifact_name},{timestamp}"
            pkglist_file.write(f"{repo_row}\n")
        # Misc packages:
        logging.info("Checking miscellaneous packages")
        for pkg in config["misc_packages"]:
            logging.info(f"Downloading package {pkg['name']} from {pkg['url']}")
            # The downloaded file is only needed to compute its hash:
            with tempfile.NamedTemporaryFile() as pkg_file:
                pkg_hash = download_file_and_get_hash(pkg["url"], pkg_file.name)
            pkglist_file.write(f"{pkg['name']},{pkg_hash},misc,{artifact_name},{timestamp}\n")
def remove_image(image_name):
    """
    Runs 'docker rmi' on the given Docker image.

    Parameters
    ----------
    image_name: str
        Name of the Docker image to remove.

    Returns
    -------
    None
    """
    logging.info(f"Removing image '{image_name}'")
    rmi_command = ["docker", "rmi", image_name]
    # The output is captured so it is not printed, and the return code of
    # the command is not checked:
    subprocess.run(rmi_command, capture_output=True)
def main():
    """
    Entry point of the command line interface: parses the arguments, sets
    up logging to the given log file, then runs 'ecg' on the given
    configuration file.

    Returns
    -------
    int
        Always 0.
    """
    cli_description = (
        "\n"
        "ECG is a program that automates software environment checking for scientific artifacts.\n"
        "It is meant to be executed periodically to analyze variations in the software environment of the artifact through time.\n"
    )
    parser = argparse.ArgumentParser(prog="ecg", description=cli_description)
    parser.add_argument(
        "config",
        help="The path to the configuration file of the artifact's Docker image.",
    )
    # Every output file is a mandatory option (short flag, long flag, help):
    output_options = (
        ("-p", "--pkg-list",
         "Path to the file where the package list generated by the program should be written."),
        ("-b", "--build-status",
         "Path to the file where to write the build status of the Docker image given in the configuration file."),
        ("-a", "--artifact-hash",
         "Path to the file where to write the log of the hash of the downloaded artifact."),
        ("-l", "--log",
         "Path to the file where to write the log of the execution of ecg."),
    )
    for short_flag, long_flag, help_text in output_options:
        parser.add_argument(short_flag, long_flag, help=help_text, required=True)
    args = parser.parse_args()
    logging.basicConfig(
        filename=args.log,
        encoding='utf-8',
        format='%(levelname)s: %(message)s',
        level=logging.INFO,
    )
    # The artifact is named after its configuration file, without the
    # extension:
    config_basename = os.path.basename(args.config)
    artifact_name, _ = os.path.splitext(config_basename)
    ecg(artifact_name, args.config, args.pkg_list, args.build_status, args.artifact_hash)
    return 0
def ecg(artifact_name, config_path, pkglist_path, buildstatus_path, arthashlog_path):
    """
    Downloads the artifact described by the given configuration file,
    builds its Docker image, lists the software packages installed in the
    image if the build succeeded, then saves the build status.

    Parameters
    ----------
    artifact_name: str
        Name of the artifact, also used as the name of the Docker image.

    config_path: str
        Path to the JSON configuration file of the artifact.

    pkglist_path: str
        Path to the file where the package list is written.

    buildstatus_path: str
        Path to the file where the build status is written. The status is
        "success", "artifact_unavailable", or the error category given by
        'builderror_identifier' if the build failed.

    arthashlog_path: str
        Path to the file where the hash of the artifact is logged.

    Returns
    -------
    None
    """
    # just in case Snakemake does not create them
    for output_path in (pkglist_path, buildstatus_path, arthashlog_path):
        pathlib.Path(output_path).touch()
    with open(config_path, "r") as config_file:
        config = json.load(config_file)
    status = ""
    # The artifact is downloaded in a directory that is deleted once the
    # image has been built and checked:
    with tempfile.TemporaryDirectory() as dl_dir:
        artifact_dir = download_sources(config["artifact_url"], config["type"], arthashlog_path, dl_dir, artifact_name)
        if artifact_dir != "":
            path = os.path.join(artifact_dir, config["buildfile_dir"])
            return_code, build_output = build_image(path, config["dockerfile_path"], artifact_name, config["build_args"])
            if return_code == 0:
                status = "success"
                try:
                    check_env(config, artifact_dir, artifact_name, pkglist_path)
                finally:
                    # The image must not be left behind if the environment
                    # check fails:
                    remove_image(artifact_name)
            else:
                status = builderror_identifier(build_output)
        else:
            logging.fatal("Artifact could not be downloaded!")
            status = "artifact_unavailable"
    buildresult_saver(status, buildstatus_path, config_path)
if __name__ == "__main__":
    # 'main' returns the exit status of the program, hand it to the
    # interpreter instead of discarding it:
    sys.exit(main())