study-docker-repro-longevity/workflow/Snakefile

172 lines
6.3 KiB
Plaintext

# --- Workflow-wide settings --------------------------------------------------
configfile: "config/config.yaml"
include: "utils.smk"

import os
import datetime

# Day stamp (YYYYMMDD) computed once, when the workflow is parsed. It is used
# as the `date` component of the file names requested by the rules below.
DATE = f"{datetime.datetime.now():%Y%m%d}"

# Folders and options read from the YAML configuration.
ARTIFACTS_FOLDER_NICKEL = config["folder_artifacts_nickel"]
ARTIFACTS_FOLDER_JSON = config["folder_artifacts_json"]
BLACKLIST_FOLDER = config["folder_blacklists"]

# Extension of the exported artifact descriptions consumed by `run_ecg`.
EXTENSION = "json"

SYSTEM = config["system"]
PREFIX = config["prefix"]
ANALYSIS_DIR = config["analysis_dir"]

# Names of the artifacts handled by this run.
# NOTE(review): get_artifacts_to_build is not defined in this file; presumably
# it comes from utils.smk and filters out blacklisted artifacts -- confirm.
ARTIFACTS = get_artifacts_to_build(ARTIFACTS_FOLDER_NICKEL, BLACKLIST_FOLDER)
# Top-level target: for today's DATE and every artifact in ARTIFACTS, request
# the three CSV outputs and the log, plus today's blacklist file.
rule all:
    input:
        expand(
            f"{PREFIX}/{{folder}}/{{artifact}}/{{date}}.csv",
            folder=["pkgs", "build_status", "artifact_hash"],
            artifact=ARTIFACTS,
            date=DATE
        ),
        expand(
            f"{PREFIX}/{{folder}}/{{artifact}}/{{date}}.txt",
            folder=["logs"],
            artifact=ARTIFACTS,
            date=DATE
        ),
        f"{BLACKLIST_FOLDER}/{DATE}.csv"
# Target rule: request the JSON export of every artifact in ARTIFACTS.
rule check_all:
    input:
        expand(
            f"{ARTIFACTS_FOLDER_JSON}/{{artifact}}.json",
            artifact=ARTIFACTS
        )
# Export one Nickel artifact description to JSON, applying the `Artifact`
# contract imported from the contract file to the artifact on the way.
rule check_artifact:
    input:
        "flake.nix",
        "flake.lock",
        contract="workflow/nickel/artifact_contract.ncl",
        artifact=f"{ARTIFACTS_FOLDER_NICKEL}/{{artifact}}.ncl"
    output:
        f"{ARTIFACTS_FOLDER_JSON}/{{artifact}}.json"
    shell:
        # The Nickel program is passed on stdin through a here-string. The
        # doubled braces are escapes so that Snakemake emits literal `{` / `}`.
        """
        nickel export --format json --output {output} <<< 'let {{Artifact, ..}} = import "{input.contract}" in ((import "{input.artifact}") | Artifact)'
        """
# Command templates used by `run_ecg`.
#   "local": runs the ECG wrapper directly.
#   "g5k":   prefix for the submission script; it ends with an opening single
#            quote, so the caller appends the local command and a closing quote.
# Doubled braces survive the f-string as single braces and are filled in later
# by Snakemake (input/output/wildcards); single braces are resolved right here.
SHELLS_ECG = {
    "local": (
        f"./{{input.ecg_wrapper}} {{input.ecg}}"
        f" {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION}"
        f" {{output.pkg}} {{output.build_status}}"
        f" {{output.artifact_hash}} {{output.log}}"
    ),
    "g5k": (
        f"python3 {{input.execo_wrapper}}"
        f" --path {os.getcwd()}"
        f" --script {{input.oar_wrapper}}"
        f" --site {config['site']}"
        f" --cluster {config['cluster']}"
        f" --max-duration {config['max_duration']}"
        f" --checkpoint {config['checkpoint']}"
        f" {'--besteffort' if config['besteffort'] else ''}"
        f" --sleep_time {config['sleep_time']}"
        f" --build_status_file {{output.build_status}}"
        f" --artifact {{wildcards.artifact}}"
        " -- '"
    ),
}
# Run ECG on one artifact for one date, producing the log, the package list,
# the build status and the artifact hash files.
rule run_ecg:
    input:
        "flake.nix",
        "flake.lock",
        ecg="ecg.py",
        ecg_wrapper="workflow/scripts/ecg_wrapper.sh",
        execo_wrapper="workflow/scripts/submission_g5k.py",
        oar_wrapper="workflow/scripts/ecg_oar_wrapper.oar.bash",
        artifact=f"{ARTIFACTS_FOLDER_JSON}/{{artifact}}.{EXTENSION}"
    output:
        log=f"{PREFIX}/logs/{{artifact}}/{{date}}.txt",
        pkg=f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
        build_status=f"{PREFIX}/build_status/{{artifact}}/{{date}}.csv",
        artifact_hash=f"{PREFIX}/artifact_hash/{{artifact}}/{{date}}.csv"
    shell:
        # On "g5k" the local command is wrapped in the submission prefix and
        # closed with the matching single quote; otherwise it is run as is.
        (SHELLS_ECG["g5k"] + SHELLS_ECG["local"] + "'") if SYSTEM == "g5k" else SHELLS_ECG["local"]
# Build the blacklist of a given date from the build status files of every
# artifact: all lines except those reporting a successful build are kept.
rule update_blacklist:
    input:
        build_status=expand(
            f"{PREFIX}/build_status/{{artifact}}/{{{{date}}}}.csv",
            artifact=ARTIFACTS
        )
    output:
        f"{BLACKLIST_FOLDER}/{{date}}.csv"
    shell:
        # We need to ignore lines where build is successful.
        # `grep -v` exits with status 1 when it selects no line, i.e. when
        # every build succeeded. Snakemake runs shell commands in bash strict
        # mode, so that status would fail the rule instead of producing an
        # empty blacklist. Status 1 is therefore accepted, while real grep
        # errors (status >= 2) still abort the job.
        # The doubled braces are Snakemake escapes for the bash `{ ...; }` group.
        "cat {input} | {{ grep -v ',success' || [ $? -eq 1 ]; }} > {output}"
# Target rule: request today's per-date analysis tables.
#
# This rule has no action of its own, so it only lists the files it wants as
# inputs; each one is produced by a dedicated rule (softenv_analysis,
# buildstatus_analysis, artifact_analysis).
# NOTE(review): the previous version declared these files as *outputs*, without
# a shell/run section and with a `{date}` input wildcard that its wildcard-free
# outputs could not determine, so it could never be scheduled. It also listed
# the folder "artifact_hash", whereas artifact_analysis writes to "artifact".
# Confirm that a pure target rule is the intended behaviour.
rule analysis:
    input:
        expand(
            f"{ANALYSIS_DIR}/{{folder}}/{{date}}.csv",
            folder=["sources_stats", "pkgs_changes", "build_status", "artifact"],
            date=DATE
        )
# Software environment analysis for one date: reads the package lists of every
# artifact and writes the sources statistics and the package changes tables.
rule softenv_analysis:
    # The date is a wildcard of the output so that the `{date}` used in the
    # input can be resolved; requesting the file of today's DATE still works.
    # Restricting it to digits keeps it from matching paths in sub-folders.
    wildcard_constraints:
        date=r"\d+"
    input:
        expand(
            f"{PREFIX}/pkgs/{{artifact}}/{{{{date}}}}.csv",
            artifact=ARTIFACTS
        )
    output:
        sources_stats=f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
        pkgs_changes=f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv"
    shell:
        # One command per line: adjacent string literals would be concatenated
        # without any separator and form a single, invalid command.
        """
        python3 analysis/softenv_analysis.py -t sources-stats -i {input} -o {output.sources_stats}
        python3 analysis/softenv_analysis.py -t pkgs-changes -i {input} -o {output.pkgs_changes}
        """
# Build status analysis for one date: reads the build status files of every
# artifact and writes a single table.
rule buildstatus_analysis:
    # The date is a wildcard of the output so that the `{date}` used in the
    # input can be resolved; requesting the file of today's DATE still works.
    # Restricting it to digits keeps it from matching paths in sub-folders.
    wildcard_constraints:
        date=r"\d+"
    input:
        expand(
            f"{PREFIX}/build_status/{{artifact}}/{{{{date}}}}.csv",
            artifact=ARTIFACTS
        )
    output:
        f"{ANALYSIS_DIR}/build_status/{{date}}.csv"
    shell:
        "python3 analysis/buildstatus_analysis.py -i {input} -o {output}"
# Artifact analysis for one date: reads the artifact hash files of every
# artifact and writes a single table.
rule artifact_analysis:
    # The date is a wildcard of the output so that the `{date}` used in the
    # input can be resolved; requesting the file of today's DATE still works.
    # Restricting it to digits keeps it from matching paths in sub-folders.
    wildcard_constraints:
        date=r"\d+"
    input:
        expand(
            f"{PREFIX}/artifact_hash/{{artifact}}/{{{{date}}}}.csv",
            artifact=ARTIFACTS
        )
    output:
        f"{ANALYSIS_DIR}/artifact/{{date}}.csv"
    shell:
        "python3 analysis/artifact_analysis.py -i {input} -o {output}"
# Copy each per-date analysis table into the matching `aggregated` folder.
# NOTE(review): as written, each output is built from the single input table of
# the same date; if aggregation over several dates is intended, the inputs need
# to be extended -- confirm.
rule analysis_aggregate:
    # The date is a wildcard of the outputs so that the `{date}` used in the
    # inputs can be resolved; requesting the files of today's DATE still works.
    # Restricting it to digits keeps it from matching paths in sub-folders.
    wildcard_constraints:
        date=r"\d+"
    input:
        sources_stats=f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
        pkgs_changes=f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv",
        build_status=f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
        artifact=f"{ANALYSIS_DIR}/artifact/{{date}}.csv"
    output:
        sources_stats=f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv",
        pkgs_changes=f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv",
        build_status=f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv",
        artifact=f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
    shell:
        # One command per line: adjacent string literals would be concatenated
        # without any separator and form a single, invalid command.
        """
        cat {input.sources_stats} > {output.sources_stats}
        cat {input.pkgs_changes} > {output.pkgs_changes}
        cat {input.build_status} > {output.build_status}
        cat {input.artifact} > {output.artifact}
        """
# PLOT_HEADERS = {
# "sources_stats": ["dpkg", "rpm", "pacman", "pip", "conda", "git", "misc"],
# "pkgs_changes": ["dpkg", "rpm", "pacman", "pip", "conda", "git", "misc"],
# "build_status": ["success", "package_install_failed", "baseimage_unavailable", "artifact_unavailable", "dockerfile_not_found", "script_crash", "job_time_exceeded", "unknown_error"],
# "artifact": ["available", "unavailable", "changed"]
# }
# rule plot_all:
# input:
# expand(f"{ANALYSIS_DIR}/{{folder}}/aggregated/{{date}}.csv",\
# folder = ["sources_stats", "pkgs_changes", "build_status", "artifact"],\
# date = DATE
# )
# rule line_plot:
# input:
# expand(f"{ANALYSIS_DIR}/{{folder}}/{{artifact}}/{{date}}.csv",\
# folder = ["sources_stats", "pkgs_changes", "build_status", "artifact"],\
# artifact = ARTIFACTS,\
# date = DATE
# ),
# output:
# expand(f"{ANALYSIS_DIR}/{{folder}}/line.pdf",\
# folder = ["sources_stats", "pkgs_changes", "build_status", "artifact"],\
# artifact = ARTIFACTS,\
# date = DATE
# ),
# shell:
# f"Rscript plot.r line {{{{PLOT_HEADERS[wildcards.folder]}}}}"