From 817ec821c58cbcdacfe6bcb5841291fbdec4f08b Mon Sep 17 00:00:00 2001
From: Quentin Guilloteau
Date: Thu, 29 Aug 2024 10:22:51 +0200
Subject: [PATCH] big cleaning of ecg. probably does not work right now: to
 be tested

---
 ecg.py | 360 ++++++++++++++------------------------------------
 1 file changed, 84 insertions(+), 276 deletions(-)

diff --git a/ecg.py b/ecg.py
index bd76cbb..f745ad9 100755
--- a/ecg.py
+++ b/ecg.py
@@ -22,228 +22,75 @@ import datetime
 import sys
 import string
 import traceback
+import hashlib
 
-def trim(url):
-    """
-    Trims given URL to make it contain only lowercase letters and numbers,
-    as well as with a maximum length of 128.
-
-    Parameters
-    ----------
-    url: str
-        URL to trim.
-
-    Returns
-    -------
-    str
-        Trimmed URL.
-    """
-    trimmed = ""
-    url_lc = url.lower()
-    i = 0
-    while i < len(url_lc) and i < 128:
-        c = url_lc[i]
-        if c in string.ascii_lowercase or c in [str(x) for x in range(0, 10)]:
-            trimmed += c
-        i += 1
-    return trimmed
-
-def download_file(url, dest):
-    """
-    Downloads the file stored at the given URL and returns its hash
-    and location.
-
-    Parameters
-    ----------
-    url: str
-        URL to the file to download.
-    dest: str
-        Path to where the file should be stored.
-
-    Returns
-    -------
-    str
-        Hash of the downloaded file, or empty string if download failed.
-    """
+def download_file_and_get_hash(url, dest_path):
     file_hash = "-1"
     try:
         req = requests.get(url)
         if req.status_code != 404:
-            file = open(dest, "wb")
-            file.write(req.content)
-            file.close()
-            hash_process = subprocess.run(f"sha256sum {file.name} | cut -d ' ' -f 1 | tr -d '\n'", capture_output=True, shell=True)
-            file_hash = hash_process.stdout.decode("utf-8")
+            with open(dest_path, "wb") as file:
+                file.write(req.content)
+            file_hash = hashlib.sha256(req.content).hexdigest()
     except requests.exceptions.ConnectionError:
-        # We can just ignore this exception, as we will just return an empty
-        # hash to indicate the error:
+        # We can just ignore this exception, as we will just return the
+        # sentinel hash "-1" to indicate the error:
         pass
     return file_hash
 
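+# A minimal usage sketch of the helper above (the URL and path are
+# illustrative, not part of this repository):
+#
+#   file_hash = download_file_and_get_hash("https://example.org/artifact.zip",
+#                                          "/tmp/artifact.zip")
+#   if file_hash == "-1":
+#       ...  # download failed; no usable file was written
+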
-def download_sources(config, arthashlog_path, dl_dir, use_cache, artifact_name):
-    """
-    Downloads the source of the artifact in 'config'.
-
-    Parameters
-    ----------
-    config: dict
-        Parsed config file.
-
-    arthashlog_path: str
-        Path to the artifact hash log file.
-
-    dl_dir: str
-        Path to the directory where to download the artifact.
-
-    use_cache: bool
-        Indicates whether the cache should be used or not.
-
-    artifact_name: str
-        Name of the artifact, for the artifact hash log.
-
-    Returns
-    -------
-    temp_dir: str
-        Path to the directory where the artifact is downloaded to, or empty
-        string if download failed.
-    """
-    url = config["artifact_url"]
-    artcache_dir = trim(url)
-    artifact_dir = os.path.join(dl_dir, artcache_dir)
-    # Checking if artifact in cache. Not downloading if it is:
-    if not os.path.exists(artifact_dir) or not use_cache:
-        logging.info(f"Downloading artifact from {url}")
-        # In case cache was used before:
-        if not use_cache:
-            os.system(f"rm -rf {artifact_dir}")
-        os.mkdir(artifact_dir)
-        artifact_file = tempfile.NamedTemporaryFile()
-        artifact_path = artifact_file.name
-        artifact_hash = download_file(url, artifact_path)
-        # If download was successful:
-        if artifact_hash != "-1":
-            if config["type"] == "zip":
-                artifact = zipfile.ZipFile(artifact_path)
-            elif config["type"] == "tar":
-                artifact = tarfile.open(artifact_path)
-            logging.info(f"Extracting artifact at {artifact_dir}")
-            artifact.extractall(artifact_dir)
-        # If download failed:
-        else:
-            os.rmdir(artifact_dir)
-            artifact_dir = ""
-        # Logging the current hash of the artifact:
-        arthashlog_file = open(arthashlog_path, "a")
-        now = datetime.datetime.now()
-        timestamp = str(datetime.datetime.timestamp(now))
-        # Artifact hash will be an empty string if download failed:
-        arthashlog_file.write(f"{timestamp},{artifact_hash},{artifact_name}\n")
-        arthashlog_file.close()
-    else:
-        logging.info(f"Cache found for {url}, skipping download")
+def download_sources(url, archive_type, arthashlog_path, dl_dir, artifact_name):
+    logging.info(f"Downloading artifact from {url}")
+
+    artifact_dir = ""
+
+    tmp_artifact_file = tempfile.NamedTemporaryFile()
+    tmp_artifact_path = tmp_artifact_file.name
+    artifact_hash = download_file_and_get_hash(url, tmp_artifact_path)
+
+    if artifact_hash != "-1":
+        artcache_dir = f"ecg_{artifact_hash[:9]}"
+        artifact_dir = os.path.join(dl_dir, artcache_dir)
+        logging.info(f"Extracting artifact at {artifact_dir}")
+        os.mkdir(artifact_dir)
+        extractors = {
+            "zip": zipfile.ZipFile,
+            "tar": tarfile.open
+        }
+        extractors[archive_type](tmp_artifact_path).extractall(artifact_dir)
+
+    # Artifact hash will be "-1" if the download failed:
+    with open(arthashlog_path, "w") as arthashlog_file:
+        now = datetime.datetime.now()
+        timestamp = str(datetime.datetime.timestamp(now))
+        arthashlog_file.write(f"{timestamp},{artifact_hash},{artifact_name}\n")
+
     return artifact_dir
 
 def builderror_identifier(output):
-
-    """
-    Parses the given 'output' to indentify the error.
-
-    Parameters
-    ----------
-    output: str
-        Output of Docker.
-
-    Returns
-    -------
-    found_error: str
-        The error that has been found in the output, according to the
-        categories. If there is more than one, only the latest is taken into
-        account.
-    """
-    # Possible error messages given by 'docker build' and their category.
-    # The key is the category, the value is a tuple of error messages belonging to
-    # to this category:
     build_errors = {
-        "package_install_failed":("Unable to locate package", "error: failed to compile"),
-        "baseimage_unavailable":("manifest unknown: manifest unknown",),
-        "dockerfile_not_found":("Dockerfile: no such file or directory",)
+        "package_install_failed": ("Unable to locate package", "error: failed to compile"),
+        "baseimage_unavailable": ("manifest unknown: manifest unknown",),
+        "dockerfile_not_found": ("Dockerfile: no such file or directory",)
     }
-
-    # Last error found is the right one in theory:
-    found_error = ""
-    unknown_error = True
     for error_cat, error_msgs in build_errors.items():
         for error in error_msgs:
             if error in output:
-                unknown_error = False
-                found_error = error_cat
-    if unknown_error:
-        found_error = "unknown_error"
-    return found_error
+                return error_cat
+    return "unknown_error"
 
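+# Worked example (hypothetical build output, not from this repository):
+#
+#   builderror_identifier("E: Unable to locate package libfoo-dev")
+#   # -> "package_install_failed"
+#   builderror_identifier("some unrecognized failure")
+#   # -> "unknown_error"
+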
-def buildresult_saver(result, buildstatus_path, config_path):
-    """
-    Saves the given result in the 'build_status' file.
-
-    Parameters
-    ----------
-    result: str
-        The result of the build. Either a Docker 'build' error
-        (see 'builderror_identifier'), another type of error
-        (for instance 'artifact_unavailable'), or 'success'
-        if build is successful.
-
-    buildstatus_path: str
-        Path to the build status file.
-
-    config_path: str
-        Path to the config file.
-
-    Returns
-    -------
-    None
-    """
-    buildstatus_file = open(buildstatus_path, "a")
-    artifact_name = os.path.basename(config_path).split(".")[0]
-    now = datetime.datetime.now()
-    timestamp = str(datetime.datetime.timestamp(now))
-    buildstatus_file.write(f"{artifact_name},{timestamp},{result}\n")
-    buildstatus_file.close()
+def buildresult_saver(result, buildstatus_path, artifact_name):
+    # Take the artifact name directly instead of re-deriving it from the
+    # config path, which is no longer available here:
+    with open(buildstatus_path, "w") as buildstatus_file:
+        now = datetime.datetime.now()
+        timestamp = str(datetime.datetime.timestamp(now))
+        buildstatus_file.write(f"{artifact_name},{timestamp},{result}\n")
 
-def build_image(config, src_dir, image_name, docker_cache = False):
-    """
-    Builds the given Docker image in 'config'.
-
-    Parameters
-    ----------
-    config: dict
-        Parsed config file.
-
-    src_dir: str
-        Path to the directory where the artifact is stored.
-
-    image_name: str
-        Name of the Docker image.
-
-    docker_cache: bool
-        Enables or disables Docker 'build' cache.
-
-    Returns
-    -------
-    return_code: bool, build_output: str
-        Return code and output of Docker 'build'.
-    """
-    cache_arg = " --no-cache"
-    if docker_cache:
-        cache_arg = ""
+def build_image(path, image_name):
     logging.info(f"Starting building image {image_name}")
-    path = os.path.join(src_dir, config["buildfile_dir"])
-    # Using trimmed artifact URL as name:
-    build_command = f"docker build{cache_arg} -t {image_name} ."
+    build_command = f"docker build --no-cache -t {image_name} ."
     build_process = subprocess.run(build_command.split(" "), cwd=path, capture_output=True)
     build_output = f"stdout:\n{build_process.stdout.decode('utf-8')}\nstderr:\n{build_process.stderr.decode('utf-8')}"
-    logging.info(f"Output of '{build_command}':")
-    logging.info(build_output)
+    logging.info(f"Output of '{build_command}':\n\n{build_output}")
     return_code = build_process.returncode
     logging.info(f"Command '{build_command}' exited with code {return_code}")
     return return_code, build_output
@@ -289,11 +136,21 @@ def check_env(config, src_dir, artifact_name, pkglist_path):
     # host, to take into account container images that do not have the formatting
     # packages installed.
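+    # Each formatting command below emits CSV rows of the form (illustrative):
+    #   <package>,<version>,<package_manager>,<artifact_name>,<timestamp>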
     pkgmgr_cmd = {
-        "dpkg": ("dpkg", "-l", "awk 'NR>5 {print $2 \",\" $3 \",dpkg," + artifact_name + "," + timestamp + "\"}'"), \
-        "rpm":("rpm", "-qa --queryformat '%{NAME},%{VERSION},rpm," + artifact_name + "," + timestamp + "\\n'", ""), \
-        "pacman":("pacman", "-Q", "awk '{print $0 \",\" $1 \",pacman," + artifact_name + "," + timestamp + "\"}'"), \
-        "pip":("pip", "list", "awk 'NR>2 {print $1 \",\" $2 \",\" \"pip," + artifact_name + "," + timestamp + "\"}'"), \
-        "conda":("/root/.conda/bin/conda", "list -e", "sed 's/=/ /g' | awk 'NR>3 {print $1 \",\" $2 \",conda," + artifact_name + "," + timestamp + "\"}'")
+        "dpkg": ("dpkg",
+                 "-l",
+                 f"awk 'NR>5 {{print $2 \",\" $3 \",dpkg,{artifact_name},{timestamp}\"}}'"),
+        "rpm": ("rpm",
+                f"-qa --queryformat '%{{NAME}},%{{VERSION}},rpm,{artifact_name},{timestamp}\\n'",
+                ""),
+        "pacman": ("pacman",
+                   "-Q",
+                   f"awk '{{print $0 \",\" $1 \",pacman,{artifact_name},{timestamp}\"}}'"),
+        "pip": ("pip",
+                "list",
+                f"awk 'NR>2 {{print $1 \",\" $2 \",\" \"pip,{artifact_name},{timestamp}\"}}'"),
+        "conda": ("/root/.conda/bin/conda",
+                  "list -e",
+                  f"sed 's/=/ /g' | awk 'NR>3 {{print $1 \",\" $2 \",conda,{artifact_name},{timestamp}\"}}'")
     }
     # Command to obtain the latest commit hash in a git repository (separated
     # into 2 parts for "--entrypoint"):
@@ -301,7 +158,6 @@ def check_env(config, src_dir, artifact_name, pkglist_path):
 
     logging.info("Checking software environment")
     pkglist_file = open(pkglist_path, "w")
-    # pkglist_file.write("package,version,package_manager\n")
     path = os.path.join(src_dir, config["buildfile_dir"])
     # Package managers:
     for pkgmgr in config["package_managers"]:
@@ -312,11 +168,11 @@ def check_env(config, src_dir, artifact_name, pkglist_path):
         pkglist_cmdargs = pkgmgr_cmd[pkgmgr][1].split(" ")
         listformat_cmd = pkgmgr_cmd[pkgmgr][2]
         logging.info(f"Checking '{pkgmgr}'")
-        # pkglist_process = subprocess.run(["docker", "run", "--rm", config["image_name"]] + pkglist_cmd.split(" "), cwd=path, capture_output=True)
         pkglist_process = subprocess.run(["docker", "run", "--rm", "--entrypoint", pkglist_cmd, artifact_name] + pkglist_cmdargs, cwd=path, capture_output=True)
         format_process = subprocess.run(f"cat << EOF | {listformat_cmd}\n{pkglist_process.stdout.decode('utf-8')}EOF", cwd=path, capture_output=True, shell=True)
         pkglist = format_process.stdout.decode("utf-8")
         pkglist_file.write(pkglist)
+
     # Python venvs:
     logging.info("Checking Python venvs")
     for venv in config["python_venvs"]:
@@ -338,43 +194,16 @@ def check_env(config, src_dir, artifact_name, pkglist_path):
     logging.info("Checking miscellaneous packages")
     for pkg in config["misc_packages"]:
         logging.info(f"Downloading package {pkg['name']} from {pkg['url']}")
-        pkg_file = tempfile.NamedTemporaryFile()
-        pkg_path = pkg_file.name
-        pkg_hash = download_file(pkg["url"], pkg_path)
-        # Package hash will be an empty string if download failed:
-        pkg_row = f"{pkg['name']},{pkg_hash},misc,{artifact_name},{timestamp}"
-        pkglist_file.write(f"{pkg_row}\n")
+        with tempfile.NamedTemporaryFile() as pkg_file:
+            pkg_hash = download_file_and_get_hash(pkg["url"], pkg_file.name)
+            pkglist_file.write(f"{pkg['name']},{pkg_hash},misc,{artifact_name},{timestamp}\n")
     pkglist_file.close()
 
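+# For reference, the package check above amounts to the following shell
+# pipeline (illustrative, for the "dpkg" entry; <artifact_name> and
+# <timestamp> are filled in at runtime):
+#
+#   docker run --rm --entrypoint dpkg <artifact_name> -l \
+#     | awk 'NR>5 {print $2 "," $3 ",dpkg,<artifact_name>,<timestamp>"}'
+#
+# except that the awk/sed step runs on the host (fed via the heredoc), not
+# inside the container.
+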
-def remove_image(config, image_name):
-    """
-    Removes the Docker image given in 'config'.
-
-    Parameters
-    ----------
-    config: dict
-        Parsed config file.
-
-    image_name: str
-        Name of the Docker image.
-
-    Returns
-    -------
-    None
-    """
+def remove_image(image_name):
     logging.info(f"Removing image '{image_name}'")
     subprocess.run(["docker", "rmi", image_name], capture_output = True)
 
 def main():
-    # Paths:
-    config_path = ""
-    pkglist_path = "" # Package list being generated
-    buildstatus_path = "" # Status of the build process of the image, when it fails
-    arthashlog_path = "" # Log of the hash of the downloaded artifact
-    cache_dir = "" # Artifact cache directory, when using one. 'None' value indicates no cache.
-    use_cache = False
-
-    # Command line arguments parsing:
     parser = argparse.ArgumentParser(
         prog = "ecg",
         description =
@@ -402,66 +231,45 @@ def main():
         help = "Path to the file where to write the log of the hash of the downloaded artifact.",
         required = True
     )
-    parser.add_argument(
-        "-c", "--cache-dir",
-        help =
-        """
-        Path to the cache directory, where artifacts that are downloaded will be stored for future usage.
-        If not specified, cache is disabled.
-        """,
-        required = False
-    ),
     args = parser.parse_args()
 
-    # Setting up the paths of the outputs:
-    pkglist_path = args.pkg_list
-    buildstatus_path = args.build_status
-    arthashlog_path = args.artifact_hash
-    cache_dir = args.cache_dir
+    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
 
-    # Creating the output files to avoid complaints from Snakemake about missing
-    # outputs...
+    config_path = args.config
+    with open(config_path, "r") as config_file:
+        config = json.loads(config_file.read())
+
+    artifact_name = os.path.splitext(os.path.basename(config_path))[0]
+
+    ecg(artifact_name, config, args.pkg_list, args.build_status, args.artifact_hash)
+
+    return 0
+
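+# For reference, a minimal config sketch with the fields read in this file
+# (not exhaustive; field names as used above, values purely illustrative):
+#
+#   {
+#     "artifact_url": "https://example.org/artifact.zip",
+#     "type": "zip",
+#     "buildfile_dir": ".",
+#     "package_managers": ["dpkg", "pip"],
+#     "python_venvs": [],
+#     "misc_packages": [{"name": "foo", "url": "https://example.org/foo.tar.gz"}]
+#   }
+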
+def ecg(artifact_name, config, pkglist_path, buildstatus_path, arthashlog_path):
+    # Create the output files, just in case Snakemake does not create them:
     pathlib.Path(pkglist_path).touch()
     pathlib.Path(buildstatus_path).touch()
     pathlib.Path(arthashlog_path).touch()
 
-    # Setting up the log:
-    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
-
-    # Parsing the input file including the configuration of the artifact's
-    # image:
-    config_path = args.config
     status = ""
-    config_file = open(config_path, "r")
-    config = json.loads(config_file.read())
-    config_file.close()
-    dl_dir = None
-    # If not using cache, creates a temporary directory:
-    if cache_dir == None:
-        tmp_dir = tempfile.TemporaryDirectory()
-        dl_dir = tmp_dir.name
-    else:
-        use_cache = True
-        dl_dir = cache_dir
-    artifact_name = os.path.splitext(os.path.basename(config_path))[0]
-    artifact_dir = download_sources(config, arthashlog_path, dl_dir, use_cache, artifact_name)
-    # If download was successful:
-    if artifact_dir != "":
-        return_code, build_output = build_image(config, artifact_dir, artifact_name, args.docker_cache)
-        if return_code == 0:
-            status = "success"
-            check_env(config, artifact_dir, artifact_name, pkglist_path)
-            remove_image(config, artifact_name)
-        else:
-            status = builderror_identifier(build_output)
-        # Creates file if not already:
-        pathlib.Path(pkglist_path).touch()
-    # If download failed, we need to save the error to the build status log:
-    else:
-        logging.fatal("Artifact could not be downloaded!")
-        status = "artifact_unavailable"
-    buildresult_saver(status, buildstatus_path, config_path)
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        # The context manager yields the path itself (a str), not an object:
+        dl_dir = tmp_dir
+        artifact_dir = download_sources(config["artifact_url"], config["type"], arthashlog_path, dl_dir, artifact_name)
+
+        if artifact_dir != "":
+            path = os.path.join(artifact_dir, config["buildfile_dir"])
+            return_code, build_output = build_image(path, artifact_name)
+            if return_code == 0:
+                status = "success"
+                check_env(config, artifact_dir, artifact_name, pkglist_path)
+                remove_image(artifact_name)
+            else:
+                status = builderror_identifier(build_output)
+        else:
+            logging.fatal("Artifact could not be downloaded!")
+            status = "artifact_unavailable"
+        buildresult_saver(status, buildstatus_path, artifact_name)
 
 if __name__ == "__main__":
     main()
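+
+# For reference, the three outputs written by a run, with rows as formatted
+# above (angle-bracket fields are placeholders, not literals):
+#   pkg_list:      <package>,<version>,<package_manager>,<artifact_name>,<timestamp>
+#   build_status:  <artifact_name>,<timestamp>,<status>, e.g. ending in "success"
+#   artifact_hash: <timestamp>,<sha256-or--1>,<artifact_name>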