Fixed aggregation by separating it into multiple rules. Fixed calls to get_analysis_dates. Fixed plot headers and plot script in plotting rules, which are now working (close #44). Rule all has been modified according to these changes. Removed global analysis rule.

The list of artifacts to build is now generated using all available blacklists, not just the latest one, ensuring that no previously blacklisted artifacts are missed. Fixed the update-blacklist rule, which failed due to grep's non-zero exit status when no lines matched. Removed the default blacklist.
This commit is contained in:
antux18 2024-08-22 17:55:05 +02:00
parent 2d5b043f8f
commit ce826c35a6
4 changed files with 129 additions and 80 deletions

View File

@ -1,2 +0,0 @@
template,0,unknown_error
test,0,unknown_error
1 template 0 unknown_error
2 test 0 unknown_error

View File

@ -4,6 +4,7 @@ folder_blacklists: "blacklists"
system: "local" system: "local"
prefix: "outputs" prefix: "outputs"
analysis_dir: "outputs/analysis" analysis_dir: "outputs/analysis"
plot_dir: "outputs/analysis/plot"
site: "grenoble" site: "grenoble"
cluster: "dahu" cluster: "dahu"

View File

@ -11,21 +11,41 @@ ARTIFACTS_FOLDER_JSON = config["folder_artifacts_json"]
BLACKLIST_FOLDER = config["folder_blacklists"] BLACKLIST_FOLDER = config["folder_blacklists"]
EXTENSION = "json" EXTENSION = "json"
SYSTEM = config["system"] SYSTEM = config["system"]
PREFIX = config["prefix"] PREFIX = config["prefix"]
ECG_OUTPUTS = ["pkgs", "build_status", "artifact_hash"] ECG_OUTPUTS = ["pkgs", "build_status", "artifact_hash"]
SHELLS_ECG = {
"local": f"./{{input.ecg_wrapper}} {{input.ecg}} {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION} {{output.pkg}} {{output.build_status}} {{output.artifact_hash}} {{output.log}}",
"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} --build_status_file {{output.build_status}} --artifact {{wildcards.artifact}} -- '"
}
ANALYSIS_DIR = config["analysis_dir"] ANALYSIS_DIR = config["analysis_dir"]
ANALYSIS_CATS = ["sources_stats", "pkgs_changes", "build_status", "artifact"] ANALYSIS_CATS = ["sources_stats", "pkgs_changes", "build_status", "artifact"]
ANALYSIS_TYPES = ["moment", "long_term"]
PLOT_DIR = config["plot_dir"]
PLOT_SCRIPT = "plot/plot.r"
PLOT_HEADERS = {
"softenv": "dpkg rpm pacman pip conda git misc",
"build_status": "success package_install_failed baseimage_unavailable artifact_unavailable dockerfile_not_found script_crash job_time_exceeded unknown_error",
"artifact": "available unavailable changed"
}
ARTIFACTS = get_artifacts_to_build(ARTIFACTS_FOLDER_NICKEL, BLACKLIST_FOLDER) ARTIFACTS = get_artifacts_to_build(ARTIFACTS_FOLDER_NICKEL, BLACKLIST_FOLDER)
rule all: rule all:
input: input:
expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv", expand(f"{ANALYSIS_DIR}/{{analysis_cat}}/plot/line/{{date}}.pdf",
cat = ANALYSIS_CATS, analysis_cat = ANALYSIS_CATS,
date = DATE date = DATE
), ),
expand(f"{ANALYSIS_DIR}/{{analysis_cat}}/plot/bar/{{date}}.pdf",
analysis_cat = ["sources_stats", "build_status", "artifact"],
date = DATE
),
# expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
# cat = ANALYSIS_CATS,
# date = DATE
# ),
# expand(f"{PREFIX}/{{folder}}/{{artifact}}/{{date}}.csv", # expand(f"{PREFIX}/{{folder}}/{{artifact}}/{{date}}.csv",
# folder=["pkgs", "build_status", "artifact_hash"], # folder=["pkgs", "build_status", "artifact_hash"],
# artifact=ARTIFACTS, # artifact=ARTIFACTS,
@ -54,11 +74,6 @@ rule check_artifact:
nickel export --format json --output {output} <<< 'let {{Artifact, ..}} = import "{input.contract}" in ((import "{input.artifact}") | Artifact)' nickel export --format json --output {output} <<< 'let {{Artifact, ..}} = import "{input.contract}" in ((import "{input.artifact}") | Artifact)'
""" """
SHELLS_ECG = {
"local": f"./{{input.ecg_wrapper}} {{input.ecg}} {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION} {{output.pkg}} {{output.build_status}} {{output.artifact_hash}} {{output.log}}",
"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} --build_status_file {{output.build_status}} --artifact {{wildcards.artifact}} -- '"
}
rule run_ecg: rule run_ecg:
input: input:
"flake.nix", "flake.nix",
@ -85,15 +100,16 @@ rule update_blacklist:
f"{BLACKLIST_FOLDER}/{{date}}.csv" f"{BLACKLIST_FOLDER}/{{date}}.csv"
shell: shell:
# We need to ignore lines where build is successful: # We need to ignore lines where build is successful:
f"cat {{input}} | grep -v ',success' > {{output}}" f"cat {{input}} | grep -v ',success' > {{output}} || true"
rule analysis: # rule analysis:
input: # input:
expand(f"{PREFIX}/{{output_dir}}/{{artifact}}/{{date}}.csv", # expand(f"{PREFIX}/{{output_dir}}/{{artifact}}/{{date}}.csv",
output_dir = ECG_OUTPUTS, # output_dir = ECG_OUTPUTS,
artifact = ARTIFACTS, # artifact = ARTIFACTS,
date = get_analysis_dates("{PREFIX}/{wildcards.output_dir}") # # date = get_analysis_dates("{PREFIX}/{output_dir}")
) # date = glob_wildcards("{PREFIX}/{output_dir}/{artifact}/{date}.csv").date
# )
rule softenv_analysis: rule softenv_analysis:
wildcard_constraints: wildcard_constraints:
@ -105,7 +121,7 @@ rule softenv_analysis:
), ),
pkgs_changes = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv", pkgs_changes = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
artifact = ARTIFACTS, artifact = ARTIFACTS,
date = get_analysis_dates("{PREFIX}/pkgs") date = get_analysis_dates(f"{PREFIX}/pkgs")
) )
output: output:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv", sources_stats = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
@ -137,7 +153,7 @@ rule artifact_analysis:
input: input:
expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{date}}.csv", expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{date}}.csv",
artifact = ARTIFACTS, artifact = ARTIFACTS,
date = get_analysis_dates("{PREFIX}/artifact_hash") date = get_analysis_dates(f"{PREFIX}/artifact_hash")
) )
output: output:
f"{ANALYSIS_DIR}/artifact/{{date}}.csv", f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
@ -148,68 +164,100 @@ rule artifact_analysis:
rule analysis_aggregate: rule analysis_aggregate:
input: input:
expand(f"{ANALYSIS_DIR}/{{input_cat}}/{{date}}.csv", expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
input_cat = ANALYSIS_CATS, cat = ANALYSIS_CATS,
date = get_analysis_dates("{ANALYSIS_DIR}/{wildcards.cat}") date = DATE
)
# sources_stats = expand(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv", date = glob_wildcards(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv").date),
# pkgs_changes = expand(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv", date = glob_wildcards(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv").date),
# build_status = expand(f"{ANALYSIS_DIR}/build_status/{{date}}.csv", date = glob_wildcards(f"{ANALYSIS_DIR}/build_status/{{date}}.csv").date),
# artifact = expand(f"{ANALYSIS_DIR}/artifact/{{date}}.csv", date = glob_wildcards(f"{ANALYSIS_DIR}/artifact/{{date}}.csv").date)
output:
expand(f"{ANALYSIS_DIR}/{{output_cat}}/aggregated/{{{{date}}}}.csv",
output_cat = ANALYSIS_CATS
)
shell:
expand(f"cat {ANALYSIS_DIR}/{{cat}}/{{{{date}}}}.csv > {ANALYSIS_DIR}/{{cat}}/aggregated/{{{{date}}}}.csv",
cat = ANALYSIS_CATS
) )
PLOT_HEADERS = { # rule single_aggregate:
"sources_stats": ["dpkg", "rpm", "pacman", "pip", "conda", "git", "misc"], # input:
"pkgs_changes": ["dpkg", "rpm", "pacman", "pip", "conda", "git", "misc"], # expand(f"{ANALYSIS_DIR}/{{{{cat}}}}/{{date}}.csv",
"build_status": ["success", "package_install_failed", "baseimage_unavailable", "artifact_unavailable", "dockerfile_not_found", "script_crash", "job_time_exceeded", "unknown_error"], # date = get_analysis_dates(f"{ANALYSIS_DIR}/{{wildcards.cat}}")
"artifact": ["available", "unavailable", "changed"] # # date = glob_wildcards("{ANALYSIS_DIR}/{cat}/{date}.csv").date
} # )
# output:
# f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv"
# shell:
# f"cat {{input}} > {{output}}"
rule pkgschgs_aggregate:
input:
expand(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/pkgs_changes")
)
output:
f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
rule srcsstats_aggregate:
input:
expand(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/sources_stats")
)
output:
f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
rule artifact_aggregate:
input:
expand(f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/artifact")
)
output:
f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
rule buildstatus_aggregate:
input:
expand(f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/build_status")
)
output:
f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
rule all_plot: rule all_plot:
input: input:
expand(f"{ANALYSIS_DIR}/{{folder}}/aggregated/{{date}}.csv", expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
folder = ["sources_stats", "pkgs_changes", "build_status", "artifact"], cat = ANALYSIS_CATS,
date = DATE date = DATE
) )
rule line_plot: rule line_plot:
input: input:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{DATE}.csv", sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv",
pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{DATE}.csv", pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{DATE}.csv", build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{DATE}.csv" artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
output: output:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/line/{DATE}.csv", sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/line/{{date}}.pdf",
pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/plot/line/{DATE}.csv", pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/plot/line/{{date}}.pdf",
build_status = f"{ANALYSIS_DIR}/build_status/plot/line/{DATE}.csv", build_status = f"{ANALYSIS_DIR}/build_status/plot/line/{{date}}.pdf",
artifact = f"{ANALYSIS_DIR}/artifact/plot/line/{DATE}.csv" artifact = f"{ANALYSIS_DIR}/artifact/plot/line/{{date}}.pdf"
shell: shell:
f""" f"""
Rscript plot.r line {{input.sources_stats}} {{output.sources_stats}} {{{{PLOT_HEADERS["sources_stats"]}}}} Rscript {PLOT_SCRIPT} line {{input.sources_stats}} {{output.sources_stats}} {PLOT_HEADERS["softenv"]} timestamp
Rscript plot.r line {{input.pkgs_changes}} {{output.pkgs_changes}} {{{{PLOT_HEADERS["pkgs_changes"]}}}} Rscript {PLOT_SCRIPT} line {{input.pkgs_changes}} {{output.pkgs_changes}} {PLOT_HEADERS["softenv"]} timestamp
Rscript plot.r line {{input.build_status}} {{output.build_status}} {{{{PLOT_HEADERS["build_status"]}}}} Rscript {PLOT_SCRIPT} line {{input.build_status}} {{output.build_status}} {PLOT_HEADERS["build_status"]} timestamp
Rscript plot.r line {{input.artifact}} {{output.artifact}} {{{{PLOT_HEADERS["artifact"]}}}} Rscript {PLOT_SCRIPT} line {{input.artifact}} {{output.artifact}} {PLOT_HEADERS["artifact"]} timestamp
""" """
rule bar_plot: rule bar_plot:
input: input:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{DATE}.csv", sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{DATE}.csv", build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{DATE}.csv" artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
output: output:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/bar/{DATE}.csv", sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/bar/{{date}}.pdf",
build_status = f"{ANALYSIS_DIR}/build_status/plot/bar/{DATE}.csv", build_status = f"{ANALYSIS_DIR}/build_status/plot/bar/{{date}}.pdf",
artifact = f"{ANALYSIS_DIR}/artifact/plot/bar/{DATE}.csv" artifact = f"{ANALYSIS_DIR}/artifact/plot/bar/{{date}}.pdf"
shell: shell:
f""" f"""
Rscript plot.r bar {{input.sources_stats}} {{output.sources_stats}} {{{{PLOT_HEADERS["sources_stats"]}}}} Rscript {PLOT_SCRIPT} bar {{input.sources_stats}} {{output.sources_stats}} {PLOT_HEADERS["softenv"]} timestamp
Rscript plot.r bar {{input.build_status}} {{output.build_status}} {{{{PLOT_HEADERS["build_status"]}}}} Rscript {PLOT_SCRIPT} bar {{input.build_status}} {{output.build_status}} {PLOT_HEADERS["build_status"]} timestamp
Rscript plot.r bar {{input.artifact}} {{output.artifact}} {{{{PLOT_HEADERS["artifact"]}}}} Rscript {PLOT_SCRIPT} bar {{input.artifact}} {{output.artifact}} {PLOT_HEADERS["artifact"]} timestamp
""" """

View File

@ -11,26 +11,28 @@ def get_analysis_dates(directory):
outputs.append(datetime.datetime.now().strftime("%Y%m%d")) outputs.append(datetime.datetime.now().strftime("%Y%m%d"))
return outputs return outputs
def find_last_blacklist(blacklist_dir_path): # def find_last_blacklist(blacklist_dir_path):
last_blacklist = "0.csv" # last_blacklist = "0.csv"
for blacklist in os.listdir(blacklist_dir_path): # for blacklist in os.listdir(blacklist_dir_path):
if not os.path.isdir(blacklist): # if not os.path.isdir(blacklist):
# We want the latest one, so the one that has the most recent date # # We want the latest one, so the one that has the most recent date
# as file name: # # as file name:
curbl_date = int(os.path.splitext(blacklist)[0]) # curbl_date = int(os.path.splitext(blacklist)[0])
lastbl_date = int(os.path.splitext(last_blacklist)[0]) # lastbl_date = int(os.path.splitext(last_blacklist)[0])
if curbl_date > lastbl_date: # if curbl_date > lastbl_date:
last_blacklist = blacklist # last_blacklist = blacklist
return last_blacklist # return last_blacklist
def get_blacklisted(blacklist_dir_path): def get_blacklisted(blacklist_dir_path):
blacklisted = set() blacklisted = set()
if os.path.exists(blacklist_dir_path): if os.path.exists(blacklist_dir_path):
blacklist_csv_path = os.path.join(blacklist_dir_path, find_last_blacklist(blacklist_dir_path)) for blacklist in os.listdir(blacklist_dir_path):
with open(blacklist_csv_path, "r") as csv_file: if not os.path.isdir(blacklist):
spamreader = csv.reader(csv_file, delimiter=",") blacklist_csv_path = os.path.join(blacklist_dir_path, blacklist)
for row in spamreader: with open(blacklist_csv_path, "r") as csv_file:
blacklisted.add(row[0]) spamreader = csv.reader(csv_file, delimiter=",")
for row in spamreader:
blacklisted.add(row[0])
return blacklisted return blacklisted
def get_artifacts_to_build(artifacts_folder, blacklist_dir_path): def get_artifacts_to_build(artifacts_folder, blacklist_dir_path):