big cleaning of ecg. probably does not work right now: to be tested

This commit is contained in:
Quentin Guilloteau 2024-08-29 10:22:51 +02:00
parent 258cd64e40
commit 817ec821c5

336
ecg.py
View File

@ -22,228 +22,75 @@ import datetime
import sys import sys
import string import string
import traceback import traceback
import hashlib
def trim(url): def download_file_and_get_hash(url, dest_path):
"""
Trims given URL to make it contain only lowercase letters and numbers,
as well as with a maximum length of 128.
Parameters
----------
url: str
URL to trim.
Returns
-------
str
Trimmed URL.
"""
trimmed = ""
url_lc = url.lower()
i = 0
while i < len(url_lc) and i < 128:
c = url_lc[i]
if c in string.ascii_lowercase or c in [str(x) for x in range(0, 10)]:
trimmed += c
i += 1
return trimmed
def download_file(url, dest):
"""
Downloads the file stored at the given URL and returns its hash
and location.
Parameters
----------
url: str
URL to the file to download.
dest: str
Path to where the file should be stored.
Returns
-------
str
Hash of the downloaded file, or empty string if download failed.
"""
file_hash = "-1" file_hash = "-1"
try: try:
req = requests.get(url) req = requests.get(url)
if req.status_code != 404: if req.status_code != 404:
file = open(dest, "wb") with open(dest_path, "wb") a file:
file.write(req.content) file.write(req.content)
file.close() file_hash = hashlib.sha256(req.content).hexdigest()
hash_process = subprocess.run(f"sha256sum {file.name} | cut -d ' ' -f 1 | tr -d '\n'", capture_output=True, shell=True)
file_hash = hash_process.stdout.decode("utf-8")
except requests.exceptions.ConnectionError: except requests.exceptions.ConnectionError:
# We can just ignore this exception, as we will just return an empty # We can just ignore this exception, as we will just return an empty
# hash to indicate the error: # hash to indicate the error:
pass pass
return file_hash return file_hash
def download_sources(config, arthashlog_path, dl_dir, use_cache, artifact_name): def download_sources(url, archive_type, arthashlog_path, dl_dir, artifact_name):
"""
Downloads the source of the artifact in 'config'.
Parameters
----------
config: dict
Parsed config file.
arthashlog_path: str
Path to the artifact hash log file.
dl_dir: str
Path to the directory where to download the artifact.
use_cache: bool
Indicates whether the cache should be used or not.
artifact_name: str
Name of the artifact, for the artifact hash log.
Returns
-------
temp_dir: str
Path to the directory where the artifact is downloaded to, or empty
string if download failed.
"""
url = config["artifact_url"]
artcache_dir = trim(url)
artifact_dir = os.path.join(dl_dir, artcache_dir)
# Checking if artifact in cache. Not downloading if it is:
if not os.path.exists(artifact_dir) or not use_cache:
logging.info(f"Downloading artifact from {url}") logging.info(f"Downloading artifact from {url}")
# In case cache was used before:
if not use_cache:
os.system(f"rm -rf {artifact_dir}")
os.mkdir(artifact_dir)
artifact_file = tempfile.NamedTemporaryFile()
artifact_path = artifact_file.name
artifact_hash = download_file(url, artifact_path)
# If download was successful:
if artifact_hash != "-1":
if config["type"] == "zip":
artifact = zipfile.ZipFile(artifact_path)
elif config["type"] == "tar":
artifact = tarfile.open(artifact_path)
logging.info(f"Extracting artifact at {artifact_dir}")
artifact.extractall(artifact_dir)
# If download failed:
else:
os.rmdir(artifact_dir)
artifact_dir = "" artifact_dir = ""
# Logging the current hash of the artifact:
arthashlog_file = open(arthashlog_path, "a") tmp_artifact_file = tempfile.NamedTemporaryFile()
tmp_artifact_path = artifact_file.name
artifact_hash = download_file_and_get_hash(url, tmp_artifact_path)
if artifact_hash != "-1":
logging.info(f"Extracting artifact at {artifact_dir}")
artcache_dir = f"ecg_{artifact_hash[:9]}"
artifact_dir = os.path.join(dl_dir, artcache_dir)
extractors = {
"zip": zipfile.ZipFile,
"tar": tarfile.open
}
os.mkdir(artifact_dir)
extractors[archive_type](artifact_path).extractall(artifact_dir)
with open(arthashlog_path, "w") as arthashlog_file:
now = datetime.datetime.now() now = datetime.datetime.now()
timestamp = str(datetime.datetime.timestamp(now)) timestamp = str(datetime.datetime.timestamp(now))
# Artifact hash will be an empty string if download failed:
arthashlog_file.write(f"{timestamp},{artifact_hash},{artifact_name}\n") arthashlog_file.write(f"{timestamp},{artifact_hash},{artifact_name}\n")
arthashlog_file.close()
else:
logging.info(f"Cache found for {url}, skipping download")
return artifact_dir return artifact_dir
def builderror_identifier(output): def builderror_identifier(output):
"""
Parses the given 'output' to indentify the error.
Parameters
----------
output: str
Output of Docker.
Returns
-------
found_error: str
The error that has been found in the output, according to the
categories. If there is more than one, only the latest is taken into
account.
"""
# Possible error messages given by 'docker build' and their category.
# The key is the category, the value is a tuple of error messages belonging to
# to this category:
build_errors = { build_errors = {
"package_install_failed":("Unable to locate package", "error: failed to compile"), "package_install_failed": ("Unable to locate package", "error: failed to compile"),
"baseimage_unavailable":("manifest unknown: manifest unknown",), "baseimage_unavailable": ("manifest unknown: manifest unknown",),
"dockerfile_not_found":("Dockerfile: no such file or directory",) "dockerfile_not_found": ("Dockerfile: no such file or directory",)
} }
# Last error found is the right one in theory:
found_error = ""
unknown_error = True
for error_cat, error_msgs in build_errors.items(): for error_cat, error_msgs in build_errors.items():
for error in error_msgs: for error in error_msgs:
if error in output: if error in output:
unknown_error = False return error_cat
found_error = error_cat return "unknown_error"
if unknown_error:
found_error = "unknown_error"
return found_error
def buildresult_saver(result, buildstatus_path, config_path): def buildresult_saver(result, buildstatus_path, config_path):
""" with open(buildstatus_path, "w") as buildstatus_file:
Saves the given result in the 'build_status' file.
Parameters
----------
result: str
The result of the build. Either a Docker 'build' error
(see 'builderror_identifier'), another type of error
(for instance 'artifact_unavailable'), or 'success'
if build is successful.
buildstatus_path: str
Path to the build status file.
config_path: str
Path to the config file.
Returns
-------
None
"""
buildstatus_file = open(buildstatus_path, "a")
artifact_name = os.path.basename(config_path).split(".")[0] artifact_name = os.path.basename(config_path).split(".")[0]
now = datetime.datetime.now() now = datetime.datetime.now()
timestamp = str(datetime.datetime.timestamp(now)) timestamp = str(datetime.datetime.timestamp(now))
buildstatus_file.write(f"{artifact_name},{timestamp},{result}\n") buildstatus_file.write(f"{artifact_name},{timestamp},{result}\n")
buildstatus_file.close()
def build_image(config, src_dir, image_name, docker_cache = False): def build_image(path, image_name):
"""
Builds the given Docker image in 'config'.
Parameters
----------
config: dict
Parsed config file.
src_dir: str
Path to the directory where the artifact is stored.
image_name: str
Name of the Docker image.
docker_cache: bool
Enables or disables Docker 'build' cache.
Returns
-------
return_code: bool, build_output: str
Return code and output of Docker 'build'.
"""
cache_arg = " --no-cache"
if docker_cache:
cache_arg = ""
logging.info(f"Starting building image {image_name}") logging.info(f"Starting building image {image_name}")
path = os.path.join(src_dir, config["buildfile_dir"]) path = os.path.join(src_dir, config["buildfile_dir"])
# Using trimmed artifact URL as name: build_command = f"docker build --no-cache -t {image_name} ."
build_command = f"docker build{cache_arg} -t {image_name} ."
build_process = subprocess.run(build_command.split(" "), cwd=path, capture_output=True) build_process = subprocess.run(build_command.split(" "), cwd=path, capture_output=True)
build_output = f"stdout:\n{build_process.stdout.decode('utf-8')}\nstderr:\n{build_process.stderr.decode('utf-8')}" build_output = f"stdout:\n{build_process.stdout.decode('utf-8')}\nstderr:\n{build_process.stderr.decode('utf-8')}"
logging.info(f"Output of '{build_command}':") logging.info(f"Output of '{build_command}':\n\n{build_output}")
logging.info(build_output)
return_code = build_process.returncode return_code = build_process.returncode
logging.info(f"Command '{build_command}' exited with code {return_code}") logging.info(f"Command '{build_command}' exited with code {return_code}")
return return_code, build_output return return_code, build_output
@ -289,11 +136,21 @@ def check_env(config, src_dir, artifact_name, pkglist_path):
# host, to take into account container images that do not have the formatting # host, to take into account container images that do not have the formatting
# packages installed. # packages installed.
pkgmgr_cmd = { pkgmgr_cmd = {
"dpkg": ("dpkg", "-l", "awk 'NR>5 {print $2 \",\" $3 \",dpkg," + artifact_name + "," + timestamp + "\"}'"), \ "dpkg": ("dpkg",\
"rpm":("rpm", "-qa --queryformat '%{NAME},%{VERSION},rpm," + artifact_name + "," + timestamp + "\\n'", ""), \ "-l",\
"pacman":("pacman", "-Q", "awk '{print $0 \",\" $1 \",pacman," + artifact_name + "," + timestamp + "\"}'"), \ f"awk 'NR>5 {{print $2 \",\" $3 \",dpkg,{artifact_name},{timestamp}\"}}'"), \
"pip":("pip", "list", "awk 'NR>2 {print $1 \",\" $2 \",\" \"pip," + artifact_name + "," + timestamp + "\"}'"), \ "rpm":("rpm",\
"conda":("/root/.conda/bin/conda", "list -e", "sed 's/=/ /g' | awk 'NR>3 {print $1 \",\" $2 \",conda," + artifact_name + "," + timestamp + "\"}'") f"-qa --queryformat '%{{NAME}},%{{VERSION}},rpm,{artifact_name},{timestamp}\\n'",\
""), \
"pacman":("pacman",\
"-Q",\
f"awk '{{print $0 \",\" $1 \",pacman,{artifact_name},{timestamp}\"}}'"), \
"pip":("pip",\
"list",\
f"awk 'NR>2 {{print $1 \",\" $2 \",\" \"pip,{artifact_name},{timestamp}\"}}'"), \
"conda":("/root/.conda/bin/conda",\
"list -e",\
f"sed 's/=/ /g' | awk 'NR>3 {{print $1 \",\" $2 \",conda,{artifact_name},{timestamp}\"}}'")
} }
# Command to obtain the latest commit hash in a git repository (separated # Command to obtain the latest commit hash in a git repository (separated
# into 2 parts for "--entrypoint"): # into 2 parts for "--entrypoint"):
@ -301,7 +158,6 @@ def check_env(config, src_dir, artifact_name, pkglist_path):
logging.info("Checking software environment") logging.info("Checking software environment")
pkglist_file = open(pkglist_path, "w") pkglist_file = open(pkglist_path, "w")
# pkglist_file.write("package,version,package_manager\n")
path = os.path.join(src_dir, config["buildfile_dir"]) path = os.path.join(src_dir, config["buildfile_dir"])
# Package managers: # Package managers:
for pkgmgr in config["package_managers"]: for pkgmgr in config["package_managers"]:
@ -312,11 +168,11 @@ def check_env(config, src_dir, artifact_name, pkglist_path):
pkglist_cmdargs = pkgmgr_cmd[pkgmgr][1].split(" ") pkglist_cmdargs = pkgmgr_cmd[pkgmgr][1].split(" ")
listformat_cmd = pkgmgr_cmd[pkgmgr][2] listformat_cmd = pkgmgr_cmd[pkgmgr][2]
logging.info(f"Checking '{pkgmgr}'") logging.info(f"Checking '{pkgmgr}'")
# pkglist_process = subprocess.run(["docker", "run", "--rm", config["image_name"]] + pkglist_cmd.split(" "), cwd=path, capture_output=True)
pkglist_process = subprocess.run(["docker", "run", "--rm", "--entrypoint", pkglist_cmd, artifact_name] + pkglist_cmdargs, cwd=path, capture_output=True) pkglist_process = subprocess.run(["docker", "run", "--rm", "--entrypoint", pkglist_cmd, artifact_name] + pkglist_cmdargs, cwd=path, capture_output=True)
format_process = subprocess.run(f"cat << EOF | {listformat_cmd}\n{pkglist_process.stdout.decode('utf-8')}EOF", cwd=path, capture_output=True, shell=True) format_process = subprocess.run(f"cat << EOF | {listformat_cmd}\n{pkglist_process.stdout.decode('utf-8')}EOF", cwd=path, capture_output=True, shell=True)
pkglist = format_process.stdout.decode("utf-8") pkglist = format_process.stdout.decode("utf-8")
pkglist_file.write(pkglist) pkglist_file.write(pkglist)
# Python venvs: # Python venvs:
logging.info("Checking Python venvs") logging.info("Checking Python venvs")
for venv in config["python_venvs"]: for venv in config["python_venvs"]:
@ -338,43 +194,16 @@ def check_env(config, src_dir, artifact_name, pkglist_path):
logging.info("Checking miscellaneous packages") logging.info("Checking miscellaneous packages")
for pkg in config["misc_packages"]: for pkg in config["misc_packages"]:
logging.info(f"Downloading package {pkg['name']} from {pkg['url']}") logging.info(f"Downloading package {pkg['name']} from {pkg['url']}")
pkg_file = tempfile.NamedTemporaryFile() with tempfile.NamedTemporaryFile() as pkg_file:
pkg_path = pkg_file.name pkg_hash = download_file_and_get_hash(pkg["url"], pkg_file.name)
pkg_hash = download_file(pkg["url"], pkg_path) pkglist_file.write(f"{pkg['name']},{pkg_hash},misc,{artifact_name},{timestamp}\n")
# Package hash will be an empty string if download failed:
pkg_row = f"{pkg['name']},{pkg_hash},misc,{artifact_name},{timestamp}"
pkglist_file.write(f"{pkg_row}\n")
pkglist_file.close() pkglist_file.close()
def remove_image(config, image_name): def remove_image(image_name):
"""
Removes the Docker image given in 'config'.
Parameters
----------
config: dict
Parsed config file.
image_name: str
Name of the Docker image.
Returns
-------
None
"""
logging.info(f"Removing image '{image_name}'") logging.info(f"Removing image '{image_name}'")
subprocess.run(["docker", "rmi", image_name], capture_output = True) subprocess.run(["docker", "rmi", image_name], capture_output = True)
def main(): def main():
# Paths:
config_path = ""
pkglist_path = "" # Package list being generated
buildstatus_path = "" # Status of the build process of the image, when it fails
arthashlog_path = "" # Log of the hash of the downloaded artifact
cache_dir = "" # Artifact cache directory, when using one. 'None' value indicates no cache.
use_cache = False
# Command line arguments parsing:
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog = "ecg", prog = "ecg",
description = description =
@ -402,62 +231,41 @@ def main():
help = "Path to the file where to write the log of the hash of the downloaded artifact.", help = "Path to the file where to write the log of the hash of the downloaded artifact.",
required = True required = True
) )
parser.add_argument(
"-c", "--cache-dir",
help =
"""
Path to the cache directory, where artifacts that are downloaded will be stored for future usage.
If not specified, cache is disabled.
""",
required = False
),
args = parser.parse_args() args = parser.parse_args()
# Setting up the paths of the outputs: logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
pkglist_path = args.pkg_list
buildstatus_path = args.build_status
arthashlog_path = args.artifact_hash
cache_dir = args.cache_dir
# Creating the output files to avoid complaints from Snakemake about missing config_path = args.config
# outputs... with open(config_path, "r") as config_file:
config = json.loads(config_file.read())
artifact_name = os.path.splitext(os.path.basename(config_path))[0]
ecg(artifact_name, config, args.pkg_list, args.build_status, args.artifact_hash)
return 0
def ecg(artifact_name, config, pkglist_path, buildstatus_path, arthashlog_path):
# just in case Snakemake does not create them
pathlib.Path(pkglist_path).touch() pathlib.Path(pkglist_path).touch()
pathlib.Path(buildstatus_path).touch() pathlib.Path(buildstatus_path).touch()
pathlib.Path(arthashlog_path).touch() pathlib.Path(arthashlog_path).touch()
# Setting up the log:
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# Parsing the input file including the configuration of the artifact's
# image:
config_path = args.config
status = "" status = ""
config_file = open(config_path, "r")
config = json.loads(config_file.read())
config_file.close()
dl_dir = None with tempfile.TemporaryDirectory() as tmp_dir:
# If not using cache, creates a temporary directory:
if cache_dir == None:
tmp_dir = tempfile.TemporaryDirectory()
dl_dir = tmp_dir.name dl_dir = tmp_dir.name
else: artifact_dir = download_sources(config["url"], config["type"], arthashlog_path, dl_dir, artifact_name)
use_cache = True
dl_dir = cache_dir
artifact_name = os.path.splitext(os.path.basename(config_path))[0]
artifact_dir = download_sources(config, arthashlog_path, dl_dir, use_cache, artifact_name)
# If download was successful:
if artifact_dir != "": if artifact_dir != "":
return_code, build_output = build_image(config, artifact_dir, artifact_name, args.docker_cache) path = os.path.join(artifact_dir, config["buildfile_dir"])
return_code, build_output = build_image(path, artifact_name)
if return_code == 0: if return_code == 0:
status = "success" status = "success"
check_env(config, artifact_dir, artifact_name, pkglist_path) check_env(config, artifact_dir, artifact_name, pkglist_path)
remove_image(config, artifact_name) remove_image(artifact_name)
else: else:
status = builderror_identifier(build_output) status = builderror_identifier(build_output)
# Creates file if not already:
pathlib.Path(pkglist_path).touch()
# If download failed, we need to save the error to the build status log:
else: else:
logging.fatal("Artifact could not be downloaded!") logging.fatal("Artifact could not be downloaded!")
status = "artifact_unavailable" status = "artifact_unavailable"