Fixed aggregation by splitting it into one rule per analysis category. Fixed the calls to get_analysis_dates, which were missing the f-string prefix and therefore passed literal, uninterpolated paths. Fixed the plot headers and the plot script invocation in the plotting rules, which now work (close #44). Rule all has been updated accordingly. Removed the global analysis rule.
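
A note on the get_analysis_dates fix, since it is easy to miss in the diff below: the old calls passed the path pattern as a plain string, so the braces were never interpolated. A minimal illustration, using PREFIX as defined in the Snakefile:

PREFIX = "outputs"

# Old call: no f prefix, so the helper received the literal text
# "{PREFIX}/pkgs", a directory that does not exist on disk.
broken = "{PREFIX}/pkgs"

# Fixed call: the f prefix interpolates the config value as intended.
fixed = f"{PREFIX}/pkgs"   # -> "outputs/pkgs"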

The list of artifacts to build is now generated from all available blacklists instead of only the most recent one, so previously blacklisted artifacts can no longer be missed. Fixed the update_blacklist rule: grep exits with a non-zero status when it selects no lines, which made the rule fail whenever no failing build was found, so the command now appends || true to tolerate that case. Removed the default blacklist.
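
The blacklist change lives in get_blacklisted (last file below): instead of locating the single newest file with find_last_blacklist, it now reads every CSV in the blacklist folder and unions the entries. Since that diff is truncated, here is a minimal sketch of the idea; the row layout artifact,exit_code,error follows the removed default blacklist above, and the final row handling is an assumption:

import csv
import os

def get_blacklisted(blacklist_dir_path):
    # Union the artifact names from *all* blacklist CSVs, so an artifact
    # blacklisted during an earlier run is still excluded today.
    blacklisted = set()
    if os.path.exists(blacklist_dir_path):
        for entry in os.listdir(blacklist_dir_path):
            path = os.path.join(blacklist_dir_path, entry)
            if not os.path.isdir(path):
                with open(path, "r") as csv_file:
                    for row in csv.reader(csv_file, delimiter=","):
                        if row:
                            blacklisted.add(row[0])  # first column: artifact name
    return blacklisted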
antux18 2024-08-22 17:55:05 +02:00
parent 2d5b043f8f
commit ce826c35a6
4 changed files with 129 additions and 80 deletions

View File

@@ -1,2 +0,0 @@
template,0,unknown_error
test,0,unknown_error

View File

@@ -4,6 +4,7 @@ folder_blacklists: "blacklists"
system: "local"
prefix: "outputs"
analysis_dir: "outputs/analysis"
plot_dir: "outputs/analysis/plot"
site: "grenoble"
cluster: "dahu"

View File

@@ -11,21 +11,41 @@ ARTIFACTS_FOLDER_JSON = config["folder_artifacts_json"]
BLACKLIST_FOLDER = config["folder_blacklists"]
EXTENSION = "json"
SYSTEM = config["system"]
PREFIX = config["prefix"]
ECG_OUTPUTS = ["pkgs", "build_status", "artifact_hash"]
SHELLS_ECG = {
"local": f"./{{input.ecg_wrapper}} {{input.ecg}} {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION} {{output.pkg}} {{output.build_status}} {{output.artifact_hash}} {{output.log}}",
"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} --build_status_file {{output.build_status}} --artifact {{wildcards.artifact}} -- '"
}
ANALYSIS_DIR = config["analysis_dir"]
ANALYSIS_CATS = ["sources_stats", "pkgs_changes", "build_status", "artifact"]
ANALYSIS_TYPES = ["moment", "long_term"]
PLOT_DIR = config["plot_dir"]
PLOT_SCRIPT = "plot/plot.r"
PLOT_HEADERS = {
"softenv": "dpkg rpm pacman pip conda git misc",
"build_status": "success package_install_failed baseimage_unavailable artifact_unavailable dockerfile_not_found script_crash job_time_exceeded unknown_error",
"artifact": "available unavailable changed"
}
ARTIFACTS = get_artifacts_to_build(ARTIFACTS_FOLDER_NICKEL, BLACKLIST_FOLDER)
rule all:
input:
expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
cat = ANALYSIS_CATS,
expand(f"{ANALYSIS_DIR}/{{analysis_cat}}/plot/line/{{date}}.pdf",
analysis_cat = ANALYSIS_CATS,
date = DATE
),
expand(f"{ANALYSIS_DIR}/{{analysis_cat}}/plot/bar/{{date}}.pdf",
analysis_cat = ["sources_stats", "build_status", "artifact"],
date = DATE
),
# expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
# cat = ANALYSIS_CATS,
# date = DATE
# ),
# expand(f"{PREFIX}/{{folder}}/{{artifact}}/{{date}}.csv",
# folder=["pkgs", "build_status", "artifact_hash"],
# artifact=ARTIFACTS,
@@ -54,11 +74,6 @@ rule check_artifact:
nickel export --format json --output {output} <<< 'let {{Artifact, ..}} = import "{input.contract}" in ((import "{input.artifact}") | Artifact)'
"""
SHELLS_ECG = {
"local": f"./{{input.ecg_wrapper}} {{input.ecg}} {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION} {{output.pkg}} {{output.build_status}} {{output.artifact_hash}} {{output.log}}",
"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} --build_status_file {{output.build_status}} --artifact {{wildcards.artifact}} -- '"
}
rule run_ecg:
input:
"flake.nix",
@@ -85,15 +100,16 @@ rule update_blacklist:
f"{BLACKLIST_FOLDER}/{{date}}.csv"
shell:
# We need to ignore lines where build is successful:
f"cat {{input}} | grep -v ',success' > {{output}}"
f"cat {{input}} | grep -v ',success' > {{output}} || true"
rule analysis:
input:
expand(f"{PREFIX}/{{output_dir}}/{{artifact}}/{{date}}.csv",
output_dir = ECG_OUTPUTS,
artifact = ARTIFACTS,
date = get_analysis_dates("{PREFIX}/{wildcards.output_dir}")
)
# rule analysis:
# input:
# expand(f"{PREFIX}/{{output_dir}}/{{artifact}}/{{date}}.csv",
# output_dir = ECG_OUTPUTS,
# artifact = ARTIFACTS,
# # date = get_analysis_dates("{PREFIX}/{output_dir}")
# date = glob_wildcards("{PREFIX}/{output_dir}/{artifact}/{date}.csv").date
# )
rule softenv_analysis:
wildcard_constraints:
@@ -105,7 +121,7 @@ rule softenv_analysis:
),
pkgs_changes = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
artifact = ARTIFACTS,
date = get_analysis_dates("{PREFIX}/pkgs")
date = get_analysis_dates(f"{PREFIX}/pkgs")
)
output:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
@@ -137,7 +153,7 @@ rule artifact_analysis:
input:
expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{date}}.csv",
artifact = ARTIFACTS,
date = get_analysis_dates("{PREFIX}/artifact_hash")
date = get_analysis_dates(f"{PREFIX}/artifact_hash")
)
output:
f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
@@ -148,68 +164,100 @@ rule artifact_analysis:
rule analysis_aggregate:
input:
expand(f"{ANALYSIS_DIR}/{{input_cat}}/{{date}}.csv",
input_cat = ANALYSIS_CATS,
date = get_analysis_dates("{ANALYSIS_DIR}/{wildcards.cat}")
)
# sources_stats = expand(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv", date = glob_wildcards(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv").date),
# pkgs_changes = expand(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv", date = glob_wildcards(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv").date),
# build_status = expand(f"{ANALYSIS_DIR}/build_status/{{date}}.csv", date = glob_wildcards(f"{ANALYSIS_DIR}/build_status/{{date}}.csv").date),
# artifact = expand(f"{ANALYSIS_DIR}/artifact/{{date}}.csv", date = glob_wildcards(f"{ANALYSIS_DIR}/artifact/{{date}}.csv").date)
output:
expand(f"{ANALYSIS_DIR}/{{output_cat}}/aggregated/{{{{date}}}}.csv",
output_cat = ANALYSIS_CATS
)
shell:
expand(f"cat {ANALYSIS_DIR}/{{cat}}/{{{{date}}}}.csv > {ANALYSIS_DIR}/{{cat}}/aggregated/{{{{date}}}}.csv",
cat = ANALYSIS_CATS
expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
cat = ANALYSIS_CATS,
date = DATE
)
PLOT_HEADERS = {
"sources_stats": ["dpkg", "rpm", "pacman", "pip", "conda", "git", "misc"],
"pkgs_changes": ["dpkg", "rpm", "pacman", "pip", "conda", "git", "misc"],
"build_status": ["success", "package_install_failed", "baseimage_unavailable", "artifact_unavailable", "dockerfile_not_found", "script_crash", "job_time_exceeded", "unknown_error"],
"artifact": ["available", "unavailable", "changed"]
}
# rule single_aggregate:
# input:
# expand(f"{ANALYSIS_DIR}/{{{{cat}}}}/{{date}}.csv",
# date = get_analysis_dates(f"{ANALYSIS_DIR}/{{wildcards.cat}}")
# # date = glob_wildcards("{ANALYSIS_DIR}/{cat}/{date}.csv").date
# )
# output:
# f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv"
# shell:
# f"cat {{input}} > {{output}}"
rule pkgschgs_aggregate:
input:
expand(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/pkgs_changes")
)
output:
f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
rule srcsstats_aggregate:
input:
expand(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/sources_stats")
)
output:
f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
rule artifact_aggregate:
input:
expand(f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/artifact")
)
output:
f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
rule buildstatus_aggregate:
input:
expand(f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/build_status")
)
output:
f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
rule all_plot:
input:
expand(f"{ANALYSIS_DIR}/{{folder}}/aggregated/{{date}}.csv",
folder = ["sources_stats", "pkgs_changes", "build_status", "artifact"],
expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
cat = ANALYSIS_CATS,
date = DATE
)
rule line_plot:
input:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{DATE}.csv",
pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{DATE}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{DATE}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{DATE}.csv"
sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv",
pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
output:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/line/{DATE}.csv",
pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/plot/line/{DATE}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/plot/line/{DATE}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/plot/line/{DATE}.csv"
sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/line/{{date}}.pdf",
pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/plot/line/{{date}}.pdf",
build_status = f"{ANALYSIS_DIR}/build_status/plot/line/{{date}}.pdf",
artifact = f"{ANALYSIS_DIR}/artifact/plot/line/{{date}}.pdf"
shell:
f"""
Rscript plot.r line {{input.sources_stats}} {{output.sources_stats}} {{{{PLOT_HEADERS["sources_stats"]}}}}
Rscript plot.r line {{input.pkgs_changes}} {{output.pkgs_changes}} {{{{PLOT_HEADERS["pkgs_changes"]}}}}
Rscript plot.r line {{input.build_status}} {{output.build_status}} {{{{PLOT_HEADERS["build_status"]}}}}
Rscript plot.r line {{input.artifact}} {{output.artifact}} {{{{PLOT_HEADERS["artifact"]}}}}
Rscript {PLOT_SCRIPT} line {{input.sources_stats}} {{output.sources_stats}} {PLOT_HEADERS["softenv"]} timestamp
Rscript {PLOT_SCRIPT} line {{input.pkgs_changes}} {{output.pkgs_changes}} {PLOT_HEADERS["softenv"]} timestamp
Rscript {PLOT_SCRIPT} line {{input.build_status}} {{output.build_status}} {PLOT_HEADERS["build_status"]} timestamp
Rscript {PLOT_SCRIPT} line {{input.artifact}} {{output.artifact}} {PLOT_HEADERS["artifact"]} timestamp
"""
rule bar_plot:
input:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{DATE}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{DATE}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{DATE}.csv"
sources_stats = f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
output:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/bar/{DATE}.csv",
build_status = f"{ANALYSIS_DIR}/build_status/plot/bar/{DATE}.csv",
artifact = f"{ANALYSIS_DIR}/artifact/plot/bar/{DATE}.csv"
sources_stats = f"{ANALYSIS_DIR}/sources_stats/plot/bar/{{date}}.pdf",
build_status = f"{ANALYSIS_DIR}/build_status/plot/bar/{{date}}.pdf",
artifact = f"{ANALYSIS_DIR}/artifact/plot/bar/{{date}}.pdf"
shell:
f"""
Rscript plot.r bar {{input.sources_stats}} {{output.sources_stats}} {{{{PLOT_HEADERS["sources_stats"]}}}}
Rscript plot.r bar {{input.build_status}} {{output.build_status}} {{{{PLOT_HEADERS["build_status"]}}}}
Rscript plot.r bar {{input.artifact}} {{output.artifact}} {{{{PLOT_HEADERS["artifact"]}}}}
Rscript {PLOT_SCRIPT} bar {{input.sources_stats}} {{output.sources_stats}} {PLOT_HEADERS["softenv"]} timestamp
Rscript {PLOT_SCRIPT} bar {{input.build_status}} {{output.build_status}} {PLOT_HEADERS["build_status"]} timestamp
Rscript {PLOT_SCRIPT} bar {{input.artifact}} {{output.artifact}} {PLOT_HEADERS["artifact"]} timestamp
"""

View File

@@ -11,22 +11,24 @@ def get_analysis_dates(directory):
outputs.append(datetime.datetime.now().strftime("%Y%m%d"))
return outputs
def find_last_blacklist(blacklist_dir_path):
last_blacklist = "0.csv"
for blacklist in os.listdir(blacklist_dir_path):
if not os.path.isdir(blacklist):
# We want the latest one, so the one that has the most recent date
# as file name:
curbl_date = int(os.path.splitext(blacklist)[0])
lastbl_date = int(os.path.splitext(last_blacklist)[0])
if curbl_date > lastbl_date:
last_blacklist = blacklist
return last_blacklist
# def find_last_blacklist(blacklist_dir_path):
# last_blacklist = "0.csv"
# for blacklist in os.listdir(blacklist_dir_path):
# if not os.path.isdir(blacklist):
# # We want the latest one, so the one that has the most recent date
# # as file name:
# curbl_date = int(os.path.splitext(blacklist)[0])
# lastbl_date = int(os.path.splitext(last_blacklist)[0])
# if curbl_date > lastbl_date:
# last_blacklist = blacklist
# return last_blacklist
def get_blacklisted(blacklist_dir_path):
blacklisted = set()
if os.path.exists(blacklist_dir_path):
blacklist_csv_path = os.path.join(blacklist_dir_path, find_last_blacklist(blacklist_dir_path))
for blacklist in os.listdir(blacklist_dir_path):
if not os.path.isdir(blacklist):
blacklist_csv_path = os.path.join(blacklist_dir_path, blacklist)
with open(blacklist_csv_path, "r") as csv_file:
spamreader = csv.reader(csv_file, delimiter=",")
for row in spamreader: