import argparse
import datetime
import hashlib
import json
import logging
import os
import pathlib
import shlex
import string
import subprocess
import sys
import tarfile
import tempfile
import traceback
import zipfile

import requests


def _current_timestamp():
    """Return the current local time as a POSIX timestamp string."""
    return str(datetime.datetime.now().timestamp())


def download_file_and_get_hash(url, dest_path):
    """
    Downloads the file at 'url' to 'dest_path' and computes its SHA-256 hash.

    Parameters
    ----------
    url: str
        URL of the file to download.

    dest_path: str
        Path where the downloaded content is written.

    Returns
    -------
    str
        Hex digest of the SHA-256 hash of the downloaded content, or "-1"
        if the connection failed or the server answered with an HTTP error
        status.
    """
    file_hash = "-1"
    try:
        req = requests.get(url)
    except requests.exceptions.ConnectionError:
        # Deliberately best-effort: the "-1" hash tells the caller that the
        # download failed.
        return file_hash
    # Any HTTP error status (not only 404) means that the body is an error
    # page rather than the requested file, so it must not be stored or hashed.
    if req.ok:
        with open(dest_path, "wb") as file:
            file.write(req.content)
        file_hash = hashlib.sha256(req.content).hexdigest()
    return file_hash


def download_sources(url, archive_type, arthashlog_path, dl_dir, artifact_name):
    """
    Downloads the artifact archive, extracts it in a subdirectory of 'dl_dir',
    and logs the hash of the archive in the artifact hash log.

    Parameters
    ----------
    url: str
        URL of the artifact archive.

    archive_type: str
        Type of the archive, either "zip" or "tar".

    arthashlog_path: str
        Path to the file where the "timestamp,hash,artifact name" row is
        written. The row is written even when the download failed, in which
        case the hash is "-1".

    dl_dir: str
        Directory under which the extraction directory is created.

    artifact_name: str
        Name of the artifact, written in the hash log.

    Returns
    -------
    str
        Path to the directory where the artifact was extracted, or an empty
        string if the artifact could not be downloaded.
    """
    logging.info(f"Downloading artifact from {url}")
    artifact_dir = ""
    extractors = {
        "zip": zipfile.ZipFile,
        "tar": tarfile.open,
    }

    # The temporary file only needs to live until the archive is extracted.
    with tempfile.NamedTemporaryFile() as tmp_artifact_file:
        tmp_artifact_path = tmp_artifact_file.name
        artifact_hash = download_file_and_get_hash(url, tmp_artifact_path)
        if artifact_hash != "-1":
            artcache_dir = f"ecg_{artifact_hash[:9]}"
            artifact_dir = os.path.join(dl_dir, artcache_dir)
            logging.info(f"Extracting artifact at {artifact_dir}")
            os.mkdir(artifact_dir)
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use with artifacts from trusted sources.
            with extractors[archive_type](tmp_artifact_path) as archive:
                archive.extractall(artifact_dir)

    with open(arthashlog_path, "w") as arthashlog_file:
        timestamp = _current_timestamp()
        arthashlog_file.write(f"{timestamp},{artifact_hash},{artifact_name}\n")
    return artifact_dir


def builderror_identifier(output):
    """
    Parses the output of a failed Docker build to identify the kind of error.

    Parameters
    ----------
    output: str
        Output of the Docker build command.

    Returns
    -------
    str
        The category of the first known error message found in 'output', or
        "unknown_error" if none matches.
    """
    build_errors = {
        "package_install_failed": ("Unable to locate package", "error: failed to compile"),
        "baseimage_unavailable": ("manifest unknown: manifest unknown",),
        "dockerfile_not_found": ("Dockerfile: no such file or directory",),
    }
    for error_cat, error_msgs in build_errors.items():
        if any(error in output for error in error_msgs):
            return error_cat
    return "unknown_error"


def buildresult_saver(result, buildstatus_path, config_path):
    """
    Writes the build result of the artifact in the build status file.

    Parameters
    ----------
    result: str
        Build status to record.

    buildstatus_path: str
        Path to the build status file.

    config_path: str
        Path to the config file. The artifact name is the part of its base
        name that precedes the first dot.

    Returns
    -------
    None
    """
    artifact_name = os.path.basename(config_path).split(".")[0]
    timestamp = _current_timestamp()
    with open(buildstatus_path, "w") as buildstatus_file:
        buildstatus_file.write(f"{artifact_name},{timestamp},{result}\n")


def build_image(path, dockerfile_path, image_name, build_args):
    """
    Builds the Docker image of the artifact.

    Parameters
    ----------
    path: str
        Directory from which the build is run (used as the build context).

    dockerfile_path: str
        Path to the Dockerfile, passed to the '-f' option.

    image_name: str
        Tag given to the built image.

    build_args: list
        Build arguments, each passed to its own '--build-arg' option.

    Returns
    -------
    tuple
        The return code of the build command and its captured output
        (stdout and stderr) as a single string.
    """
    logging.info(f"Starting building image {image_name}")
    # The command is built as an argument list instead of splitting a string
    # on spaces: an empty 'build_args' would otherwise produce an empty
    # argument, and values containing spaces would be torn apart.
    build_command = ["docker", "build", "--no-cache", "-t", image_name]
    for build_arg in build_args:
        build_command += ["--build-arg", str(build_arg)]
    build_command += ["-f", dockerfile_path, "."]
    printable_command = shlex.join(build_command)

    build_process = subprocess.run(build_command, cwd=path, capture_output=True)
    stdout = build_process.stdout.decode("utf-8", errors="replace")
    stderr = build_process.stderr.decode("utf-8", errors="replace")
    build_output = f"stdout:\n{stdout}\nstderr:\n{stderr}"
    logging.info(f"Output of '{printable_command}':\n\n{build_output}")
    return_code = build_process.returncode
    logging.info(f"Command '{printable_command}' exited with code {return_code}")
    return return_code, build_output


def _run_in_image(image_name, entrypoint, args, cwd, workdir=None):
    """Run 'entrypoint' with 'args' in a container of the image and return its raw stdout."""
    # "--entrypoint" is used to prevent the shell from printing a login
    # message, if any. It requires command and arguments to be separated.
    command = ["docker", "run", "--rm"]
    if workdir is not None:
        command += ["-w", workdir]
    command += ["--entrypoint", entrypoint, image_name, *args]
    process = subprocess.run(command, cwd=cwd, capture_output=True)
    if process.returncode != 0:
        logging.warning(f"Command '{shlex.join(command)}' exited with code {process.returncode}")
    return process.stdout


def _format_pkglist(raw_output, format_cmd, cwd):
    """Pipe 'raw_output' through the host-side 'format_cmd' and return the decoded result."""
    if not format_cmd:
        # The query command already produced the final rows.
        return raw_output.decode("utf-8", errors="replace")
    # The container output is fed on stdin rather than embedded in the shell
    # command, so that the shell never interprets it.
    format_process = subprocess.run(format_cmd, input=raw_output, cwd=cwd, capture_output=True, shell=True)
    return format_process.stdout.decode("utf-8", errors="replace")


def check_env(config, src_dir, artifact_name, pkglist_path):
    """
    Builds a list of all software packages installed in the Docker image
    given in 'config', depending on the package managers specified in the
    configuration, then stores it in a CSV file.

    Parameters
    ----------
    config: dict
        Parsed config file.

    src_dir: str
        Path to the directory where the artifact is stored.

    artifact_name: str
        Name of the artifact. Used both as the Docker image name, and for the
        packages list for tracking purpose during the output analysis.

    pkglist_path: str
        Path to the package list file.

    Returns
    -------
    None
    """
    # Saving the current time to add it to every row:
    timestamp = _current_timestamp()

    # Commands to list installed packages along with their versions and the
    # name of the package manager, depending on the package managers.
    # Each package manager is associated with a tuple, the first item being
    # the package manager's command, the second being the list of arguments
    # for the query (they must be separated for the "--entrypoint" argument
    # of Docker 'run'), and the third one being the command that will format
    # the output of the query command (this one can be an empty string in
    # case the formatting part is already done using the options of the first
    # command).
    # The first command needs to be run on the container, and the second on
    # the host, to take into account container images that do not have the
    # formatting packages installed.
    pkgmgr_cmd = {
        "dpkg": (
            "dpkg",
            ["-l"],
            f"awk 'NR>5 {{print $2 \",\" $3 \",dpkg,{artifact_name},{timestamp}\"}}'",
        ),
        "rpm": (
            "rpm",
            ["-qa", "--queryformat", f"%{{NAME}},%{{VERSION}},rpm,{artifact_name},{timestamp}\\n"],
            "",
        ),
        "pacman": (
            "pacman",
            ["-Q"],
            # 'pacman -Q' prints "name version": $1 is the name, $2 the version.
            f"awk '{{print $1 \",\" $2 \",pacman,{artifact_name},{timestamp}\"}}'",
        ),
        "pip": (
            "pip",
            ["list"],
            f"awk 'NR>2 {{print $1 \",\" $2 \",\" \"pip,{artifact_name},{timestamp}\"}}'",
        ),
        "conda": (
            "/root/.conda/bin/conda",
            ["list", "-e"],
            f"sed 's/=/ /g' | awk 'NR>3 {{print $1 \",\" $2 \",conda,{artifact_name},{timestamp}\"}}'",
        ),
    }
    # Command to obtain the latest commit hash in a git repository (separated
    # into 2 parts for "--entrypoint"):
    gitcmd = ("git", ["log", "-n", "1", "--pretty=format:%H"])

    logging.info("Checking software environment")
    path = os.path.join(src_dir, config["buildfile_dir"])
    with open(pkglist_path, "w") as pkglist_file:
        # Package managers:
        for pkgmgr in config["package_managers"]:
            pkglist_cmd, pkglist_cmdargs, listformat_cmd = pkgmgr_cmd[pkgmgr]
            logging.info(f"Checking '{pkgmgr}'")
            raw_pkglist = _run_in_image(artifact_name, pkglist_cmd, pkglist_cmdargs, path)
            pkglist_file.write(_format_pkglist(raw_pkglist, listformat_cmd, path))

        # Python venvs:
        logging.info("Checking Python venvs")
        # Venvs are always queried with pip, so pip's own formatter is used
        # regardless of which package managers were checked above.
        pipcmd, pipcmd_args, pipformat_cmd = pkgmgr_cmd["pip"]
        for venv in config["python_venvs"]:
            raw_pkglist = _run_in_image(
                artifact_name,
                venv["path"] + "/bin/" + pipcmd,
                pipcmd_args,
                path,
                workdir=venv["path"],
            )
            pkglist_file.write(_format_pkglist(raw_pkglist, pipformat_cmd, path))

        # Git packages:
        logging.info("Checking Git packages")
        for repo in config["git_packages"]:
            raw_hash = _run_in_image(artifact_name, gitcmd[0], gitcmd[1], path, workdir=repo["location"])
            commit_hash = raw_hash.decode("utf-8", errors="replace").strip()
            repo_row = f"{repo['name']},{commit_hash},git,{artifact_name},{timestamp}"
            pkglist_file.write(f"{repo_row}\n")

        # Misc packages:
        logging.info("Checking miscellaneous packages")
        for pkg in config["misc_packages"]:
            logging.info(f"Downloading package {pkg['name']} from {pkg['url']}")
            # Only the hash of the package is needed, the file is discarded.
            with tempfile.NamedTemporaryFile() as pkg_file:
                pkg_hash = download_file_and_get_hash(pkg["url"], pkg_file.name)
            pkglist_file.write(f"{pkg['name']},{pkg_hash},misc,{artifact_name},{timestamp}\n")


def remove_image(image_name):
    """
    Removes the Docker image named 'image_name'.

    Parameters
    ----------
    image_name: str
        Name of the Docker image to remove.

    Returns
    -------
    None
    """
    logging.info(f"Removing image '{image_name}'")
    subprocess.run(["docker", "rmi", image_name], capture_output=True)


def main():
    """
    Parses the command line, configures logging, then runs ECG on the given
    configuration file.

    Returns
    -------
    int
        Always 0.
    """
    parser = argparse.ArgumentParser(
        prog="ecg",
        description="""
        ECG is a program that automates software environment checking
        for scientific artifacts.
        It is meant to be executed periodically to analyze variations in the
        software environment of the artifact through time.
        """
    )
    parser.add_argument(
        "config",
        help="The path to the configuration file of the artifact's Docker image."
    )
    parser.add_argument(
        "-p", "--pkg-list",
        help="Path to the file where the package list generated by the program should be written.",
        required=True
    )
    parser.add_argument(
        "-b", "--build-status",
        help="Path to the file where to write the build status of the Docker image given in the configuration file.",
        required=True
    )
    parser.add_argument(
        "-a", "--artifact-hash",
        help="Path to the file where to write the log of the hash of the downloaded artifact.",
        required=True
    )
    parser.add_argument(
        "-l", "--log",
        help="Path to the file where to write the log of the execution of ecg.",
        required=True
    )
    args = parser.parse_args()

    logging.basicConfig(filename=args.log, encoding='utf-8', format='%(levelname)s: %(message)s', level=logging.INFO)

    config_path = args.config
    artifact_name = os.path.splitext(os.path.basename(config_path))[0]
    ecg(artifact_name, config_path, args.pkg_list, args.build_status, args.artifact_hash)
    return 0


def ecg(artifact_name, config_path, pkglist_path, buildstatus_path, arthashlog_path):
    """
    Downloads the artifact, builds its Docker image, lists the software
    packages installed in it, and records the build status.

    Parameters
    ----------
    artifact_name: str
        Name of the artifact, also used as the Docker image name.

    config_path: str
        Path to the JSON configuration file of the artifact.

    pkglist_path: str
        Path to the package list file.

    buildstatus_path: str
        Path to the build status file.

    arthashlog_path: str
        Path to the artifact hash log file.

    Returns
    -------
    None
    """
    # just in case Snakemake does not create them
    for output_path in (pkglist_path, buildstatus_path, arthashlog_path):
        pathlib.Path(output_path).touch()

    with open(config_path, "r") as config_file:
        config = json.load(config_file)

    status = ""
    with tempfile.TemporaryDirectory() as dl_dir:
        artifact_dir = download_sources(config["artifact_url"], config["type"], arthashlog_path, dl_dir, artifact_name)
        if artifact_dir != "":
            path = os.path.join(artifact_dir, config["buildfile_dir"])
            return_code, build_output = build_image(path, config["dockerfile_path"], artifact_name, config["build_args"])
            if return_code == 0:
                status = "success"
                try:
                    check_env(config, artifact_dir, artifact_name, pkglist_path)
                finally:
                    # The image is removed even if the environment check
                    # fails, so that it does not pile up between runs.
                    remove_image(artifact_name)
            else:
                status = builderror_identifier(build_output)
        else:
            logging.fatal("Artifact could not be downloaded!")
            status = "artifact_unavailable"
    buildresult_saver(status, buildstatus_path, config_path)


if __name__ == "__main__":
    sys.exit(main())