study-docker-repro-longevity/workflow/Snakefile
antux18 ce826c35a6 Fixed aggregation by separating it into multiple rules. Fixed calls to get_analysis_dates. Fixed plot headers and plot script in plotting rules, which are now working (close #44). Rule all has been modified according to these changes. Removed global analysis rule.
The list of artifacts to build is now generated using all available blacklists, not just the last one (this ensures that we don't miss any blacklisted artifacts). Fixed the update-blacklist rule to account for grep's output. Removed the default blacklist.
2024-08-22 17:55:05 +02:00

263 lines
9.1 KiB
Plaintext

# --- Workflow configuration and global constants -----------------------------
configfile: "config/config.yaml"
include: "utils.smk"
import os
import datetime
# Date stamp (YYYYMMDD) naming all of today's outputs.
DATE = datetime.datetime.now().strftime("%Y%m%d")
# Artifact descriptions: Nickel sources and their JSON exports.
ARTIFACTS_FOLDER_NICKEL = config["folder_artifacts_nickel"]
ARTIFACTS_FOLDER_JSON = config["folder_artifacts_json"]
# One blacklist CSV per run; blacklisted artifacts are excluded from ARTIFACTS below.
BLACKLIST_FOLDER = config["folder_blacklists"]
EXTENSION = "json"
# Execution backend selector: "g5k" (Grid'5000 via OAR) or anything else for local runs.
SYSTEM = config["system"]
# Root folder for per-artifact ECG outputs (pkgs, build_status, artifact_hash, logs).
PREFIX = config["prefix"]
ECG_OUTPUTS = ["pkgs", "build_status", "artifact_hash"]
# Shell command templates for rule run_ecg. The "g5k" template is a *prefix*
# that wraps the "local" command (note the unmatched opening quote, closed in
# the rule's shell expression).
SHELLS_ECG = {
"local": f"./{{input.ecg_wrapper}} {{input.ecg}} {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION} {{output.pkg}} {{output.build_status}} {{output.artifact_hash}} {{output.log}}",
"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} --build_status_file {{output.build_status}} --artifact {{wildcards.artifact}} -- '"
}
ANALYSIS_DIR = config["analysis_dir"]
# Analysis categories; each has its own analysis, aggregation and plot outputs.
ANALYSIS_CATS = ["sources_stats", "pkgs_changes", "build_status", "artifact"]
PLOT_DIR = config["plot_dir"]
PLOT_SCRIPT = "plot/plot.r"
# Column headers handed to plot/plot.r; space-separated so each word becomes
# one argument (see line_plot / bar_plot).
PLOT_HEADERS = {
"softenv": "dpkg rpm pacman pip conda git misc",
"build_status": "success package_install_failed baseimage_unavailable artifact_unavailable dockerfile_not_found script_crash job_time_exceeded unknown_error",
"artifact": "available unavailable changed"
}
# Artifacts still to build: every .ncl description minus all blacklisted
# entries (helper defined in utils.smk).
ARTIFACTS = get_artifacts_to_build(ARTIFACTS_FOLDER_NICKEL, BLACKLIST_FOLDER)
# Default target: today's plots plus today's refreshed blacklist.
rule all:
input:
# One line plot per analysis category for today's date.
expand(f"{ANALYSIS_DIR}/{{analysis_cat}}/plot/line/{{date}}.pdf",
analysis_cat = ANALYSIS_CATS,
date = DATE
),
# Bar plots for every category except pkgs_changes.
expand(f"{ANALYSIS_DIR}/{{analysis_cat}}/plot/bar/{{date}}.pdf",
analysis_cat = ["sources_stats", "build_status", "artifact"],
date = DATE
),
# Intermediate targets kept for debugging; the plot targets above already
# pull them in through the rule chain.
# expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
# cat = ANALYSIS_CATS,
# date = DATE
# ),
# expand(f"{PREFIX}/{{folder}}/{{artifact}}/{{date}}.csv",
# folder=["pkgs", "build_status", "artifact_hash"],
# artifact=ARTIFACTS,
# date=DATE
# ),
# expand(f"{PREFIX}/logs/{{artifact}}/{{date}}.txt",
# artifact=ARTIFACTS,
# date=DATE
# ),
# Today's blacklist is always (re)generated (see update_blacklist).
f"{BLACKLIST_FOLDER}/{DATE}.csv"
# Convenience target: validate every artifact description by requesting its
# JSON export (see check_artifact).
rule check_all:
    input:
        expand(f"{ARTIFACTS_FOLDER_JSON}/{{artifact}}.json", artifact = ARTIFACTS)
# Type-check one Nickel artifact description against the Artifact contract and
# export it to JSON. Fails (and thus blocks run_ecg) when the description does
# not satisfy the contract.
rule check_artifact:
input:
"flake.nix",
"flake.lock",
contract="workflow/nickel/artifact_contract.ncl",
artifact=f"{ARTIFACTS_FOLDER_NICKEL}/{{artifact}}.ncl"
output:
f"{ARTIFACTS_FOLDER_JSON}/{{artifact}}.json"
shell:
# The here-string feeds nickel a tiny program that imports the contract
# and applies it to the artifact. Doubled braces ({{ }}) are Snakemake
# escapes producing literal braces in the Nickel source.
"""
nickel export --format json --output {output} <<< 'let {{Artifact, ..}} = import "{input.contract}" in ((import "{input.artifact}") | Artifact)'
"""
# Run ECG on one artifact, producing its package list, build status, artifact
# hash and log for the given date.
rule run_ecg:
input:
"flake.nix",
"flake.lock",
ecg="ecg.py",
ecg_wrapper="workflow/scripts/ecg_wrapper.sh",
execo_wrapper="workflow/scripts/submission_g5k.py",
oar_wrapper="workflow/scripts/ecg_oar_wrapper.oar.bash",
artifact=f"{ARTIFACTS_FOLDER_JSON}/{{artifact}}.{EXTENSION}"
output:
log = f"{PREFIX}/logs/{{artifact}}/{{date}}.txt",
pkg = f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
build_status = f"{PREFIX}/build_status/{{artifact}}/{{date}}.csv",
artifact_hash = f"{PREFIX}/artifact_hash/{{artifact}}/{{date}}.csv"
shell:
# On g5k the local command is wrapped in the submission command: the
# "g5k" template ends with an unmatched opening quote, so the trailing
# "'" closes it around the local invocation. Locally, only the "local"
# template is used.
(SHELLS_ECG["g5k"] if SYSTEM == "g5k" else "") + SHELLS_ECG["local"] + ("'" if SYSTEM == "g5k" else "")
# Build today's blacklist from today's build-status files: every artifact whose
# build did not succeed is carried over.
rule update_blacklist:
    input:
        build_status = expand(f"{PREFIX}/build_status/{{artifact}}/{{{{date}}}}.csv",
            artifact = ARTIFACTS
        )
    output:
        f"{BLACKLIST_FOLDER}/{{date}}.csv"
    shell:
        # Drop successful builds. grep exits 1 when nothing matches (all builds
        # succeeded), so `|| true` keeps the rule from failing; the redirection
        # still creates an empty blacklist in that case. Plain string: Snakemake
        # substitutes {input}/{output} itself, no f-string needed.
        "cat {input} | grep -v ',success' > {output} || true"
# rule analysis:
# input:
# expand(f"{PREFIX}/{{output_dir}}/{{artifact}}/{{date}}.csv",
# output_dir = ECG_OUTPUTS,
# artifact = ARTIFACTS,
# # date = get_analysis_dates("{PREFIX}/{output_dir}")
# date = glob_wildcards("{PREFIX}/{output_dir}/{artifact}/{date}.csv").date
# )
# Software-environment analysis: per-source-type package counts for today, and
# package changes across the whole history of dates.
rule softenv_analysis:
    wildcard_constraints:
        # Raw string: "\d" is an invalid escape sequence in a normal Python
        # string literal (SyntaxWarning since 3.6, slated to become an error).
        date = r"\d+"
    input:
        # Today's package lists are enough for the source-type statistics.
        sources_stats = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
            artifact = ARTIFACTS,
            date = DATE
        ),
        # Package *changes* need every available date, not just today.
        pkgs_changes = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
            artifact = ARTIFACTS,
            date = get_analysis_dates(f"{PREFIX}/pkgs")
        )
    output:
        sources_stats = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
        pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv"
    shell:
        # Plain string: Snakemake substitutes {input}/{output} itself, so the
        # original f-string (and its doubled braces) was unnecessary.
        """
        python3 analysis/softenv_analysis.py -t sources-stats -i {input.sources_stats} -o {output.sources_stats}
        python3 analysis/softenv_analysis.py -t pkgs-changes -i {input.pkgs_changes} -o {output.pkgs_changes}
        """
# Build-status analysis: summarize today's build outcomes across all artifacts.
rule buildstatus_analysis:
    wildcard_constraints:
        # Raw string: "\d" is an invalid escape sequence in a normal Python
        # string literal (SyntaxWarning since 3.6, slated to become an error).
        date = r"\d+"
    input:
        expand(f"{PREFIX}/build_status/{{artifact}}/{{date}}.csv",
            artifact = ARTIFACTS,
            date = DATE
        )
    output:
        f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
    shell:
        # Plain string: Snakemake substitutes {input}/{output} itself.
        """
        python3 analysis/buildstatus_analysis.py -i {input} -o {output}
        """
# Artifact-availability analysis: compares artifact hashes across every
# recorded date (available / unavailable / changed).
rule artifact_analysis:
    wildcard_constraints:
        # Raw string: "\d" is an invalid escape sequence in a normal Python
        # string literal (SyntaxWarning since 3.6, slated to become an error).
        date = r"\d+"
    input:
        # Full hash history: change detection needs more than today's files.
        expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{date}}.csv",
            artifact = ARTIFACTS,
            date = get_analysis_dates(f"{PREFIX}/artifact_hash")
        )
    output:
        f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
    shell:
        # Plain string: Snakemake substitutes {input}/{output} itself.
        """
        python3 analysis/artifact_analysis.py -i {input} -o {output}
        """
# Pseudo-target: request today's aggregated CSV for every analysis category.
rule analysis_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
            cat = ANALYSIS_CATS,
            date = DATE
        )
# rule single_aggregate:
# input:
# expand(f"{ANALYSIS_DIR}/{{{{cat}}}}/{{date}}.csv",
# date = get_analysis_dates(f"{ANALYSIS_DIR}/{{wildcards.cat}}")
# # date = glob_wildcards("{ANALYSIS_DIR}/{cat}/{date}.csv").date
# )
# output:
# f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv"
# shell:
# f"cat {{input}} > {{output}}"
# Concatenate every pkgs_changes analysis produced so far into one CSV.
rule pkgschgs_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv",
            date = get_analysis_dates(f"{ANALYSIS_DIR}/pkgs_changes")
        )
    output:
        f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv"
    shell:
        # Plain string: Snakemake substitutes {input}/{output} itself.
        "cat {input} > {output}"
# Concatenate every sources_stats analysis produced so far into one CSV.
rule srcsstats_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
            date = get_analysis_dates(f"{ANALYSIS_DIR}/sources_stats")
        )
    output:
        f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv"
    shell:
        # Plain string: Snakemake substitutes {input}/{output} itself.
        "cat {input} > {output}"
# Concatenate every artifact analysis produced so far into one CSV.
rule artifact_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
            date = get_analysis_dates(f"{ANALYSIS_DIR}/artifact")
        )
    output:
        f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
    shell:
        # Plain string: Snakemake substitutes {input}/{output} itself.
        "cat {input} > {output}"
# Concatenate every build_status analysis produced so far into one CSV.
rule buildstatus_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
            date = get_analysis_dates(f"{ANALYSIS_DIR}/build_status")
        )
    output:
        f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv"
    shell:
        # Plain string: Snakemake substitutes {input}/{output} itself.
        "cat {input} > {output}"
# Pseudo-target: request today's aggregated CSVs (inputs of the plot rules)
# for every analysis category.
rule all_plot:
    input:
        expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
            cat = ANALYSIS_CATS,
            date = DATE
        )
# Time-series (line) plots: one PDF per analysis category, built from the
# aggregated CSVs, with "timestamp" as the x-axis column.
rule line_plot:
input:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv",
pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
output:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/line/{{date}}.pdf",
pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/plot/line/{{date}}.pdf",
build_status = f"{ANALYSIS_DIR}/build_status/plot/line/{{date}}.pdf",
artifact = f"{ANALYSIS_DIR}/artifact/plot/line/{{date}}.pdf"
shell:
# The header strings are deliberately unquoted: each space-separated word
# becomes a separate column-name argument to plot/plot.r.
# NOTE(review): pkgs_changes reuses the "softenv" headers — presumably it
# has the same package-source columns as sources_stats; confirm against
# the softenv_analysis output format.
f"""
Rscript {PLOT_SCRIPT} line {{input.sources_stats}} {{output.sources_stats}} {PLOT_HEADERS["softenv"]} timestamp
Rscript {PLOT_SCRIPT} line {{input.pkgs_changes}} {{output.pkgs_changes}} {PLOT_HEADERS["softenv"]} timestamp
Rscript {PLOT_SCRIPT} line {{input.build_status}} {{output.build_status}} {PLOT_HEADERS["build_status"]} timestamp
Rscript {PLOT_SCRIPT} line {{input.artifact}} {{output.artifact}} {PLOT_HEADERS["artifact"]} timestamp
"""
# Bar plots for the three categories that get them (pkgs_changes is
# line-plot only; see rule all), built from the aggregated CSVs.
rule bar_plot:
input:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
output:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/bar/{{date}}.pdf",
build_status = f"{ANALYSIS_DIR}/build_status/plot/bar/{{date}}.pdf",
artifact = f"{ANALYSIS_DIR}/artifact/plot/bar/{{date}}.pdf"
shell:
# The header strings are deliberately unquoted: each space-separated word
# becomes a separate column-name argument to plot/plot.r.
f"""
Rscript {PLOT_SCRIPT} bar {{input.sources_stats}} {{output.sources_stats}} {PLOT_HEADERS["softenv"]} timestamp
Rscript {PLOT_SCRIPT} bar {{input.build_status}} {{output.build_status}} {PLOT_HEADERS["build_status"]} timestamp
Rscript {PLOT_SCRIPT} bar {{input.artifact}} {{output.artifact}} {PLOT_HEADERS["artifact"]} timestamp
"""