The results of the get_analysis_dates function were used as inputs in some Snakemake rules, which caused Snakemake to overwrite older analyses, probably because the files used to generate those older analyses had been modified since. To avoid overwriting older analyses, the rules now only specify today's analysis in their input, and a bash wrapper script both fetches the older analyses itself (it is given the folder to look in) and runs the analysis on the fetched files. So I removed get_analysis_dates and replaced it with the wrapper script everywhere it was used.
I also removed an older, unfinished analysis wrapper that I had forgotten even existed...
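For reference, the wrapper is called in two modes from the Snakefile rules in this diff; roughly like this (the output paths and input lists below are only illustrative placeholders, not literal commands from the workflow):

    # "files" mode: only today's per-artifact CSVs are passed explicitly
    workflow/scripts/analysis_wrapper.sh files analysis/softenv_analysis.py sources-stats \
        <analysis_dir>/sources_stats/<date>.csv <today's pkgs CSV files...>

    # "dirs" mode: the per-artifact directories are passed, and the wrapper itself
    # finds every CSV inside them (i.e. the older analyses) before running the script
    workflow/scripts/analysis_wrapper.sh dirs analysis/artifact_analysis.py \
        <analysis_dir>/artifact/<date>.csv <artifact_hash directories...>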
parent 514e186c3d
commit 97447e59a1
@@ -1,18 +0,0 @@
#!/bin/bash

ANALYSIS_TYPE=$1
OUTPUT=$2
shift; shift

ARGS=$@
INPUT=("${ARGS[@]/#/-i }")
SCRIPT=""
OPT=""

case ANALYSIS_TYPE in
    "softenv")
        SCRIPT="softenv_analysis.py"
        OPT="-t sources-stats"
        ;;

python3 softenv_analysis.py -t sources-stats $INPUT -o
@@ -21,6 +21,9 @@ SHELLS_ECG = {

ANALYSIS_DIR = config["analysis_dir"]
ANALYSIS_CATS = ["sources_stats", "pkgs_changes", "build_status", "artifact"]
ANALYSIS_SCRIPTS_DIR = "analysis"
ANALYSIS_WRAPPER = "workflow/scripts/analysis_wrapper.sh"
AGGREGATE_WRAPPER = "workflow/scripts/aggregate_wrapper.sh"

PLOT_DIR = config["plot_dir"]
PLOT_SCRIPT = "plot/plot.r"
@@ -42,21 +45,10 @@ rule all:
            analysis_cat = ["sources_stats", "build_status", "artifact"],
            date = DATE
        ),
        # expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
        #     cat = ANALYSIS_CATS,
        #     date = DATE
        # ),
        # expand(f"{PREFIX}/{{folder}}/{{artifact}}/{{date}}.csv",
        #     folder=["pkgs", "build_status", "artifact_hash"],
        #     artifact=ARTIFACTS,
        #     date=DATE
        # ),
        # expand(f"{PREFIX}/logs/{{artifact}}/{{date}}.txt",
        #     artifact=ARTIFACTS,
        #     date=DATE
        # ),
        f"{BLACKLIST_FOLDER}/{DATE}.csv"

# Artifacts configuration files:

rule check_all:
    input:
        expand(f"{ARTIFACTS_FOLDER_JSON}/{{artifact}}.json", artifact=ARTIFACTS)
@@ -74,6 +66,8 @@ rule check_artifact:
        nickel export --format json --output {output} <<< 'let {{Artifact, ..}} = import "{input.contract}" in ((import "{input.artifact}") | Artifact)'
        """

# ECG:

rule run_ecg:
    input:
        "flake.nix",
@@ -102,66 +96,60 @@ rule update_blacklist:
        # We need to ignore lines where build is successful:
        f"cat {{input}} | grep -v ',success' > {{output}} || true"

# rule analysis:
#     input:
#         expand(f"{PREFIX}/{{output_dir}}/{{artifact}}/{{date}}.csv",
#             output_dir = ECG_OUTPUTS,
#             artifact = ARTIFACTS,
#             # date = get_analysis_dates("{PREFIX}/{output_dir}")
#             date = glob_wildcards("{PREFIX}/{output_dir}/{artifact}/{date}.csv").date
#         )
# Analysis:

rule softenv_analysis:
    wildcard_constraints:
        date="\d+"
    input:
        sources_stats = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
            artifact = ARTIFACTS,
            date = DATE
        today_files = expand(f"{PREFIX}/pkgs/{{artifact}}/{{{{date}}}}.csv",
            artifact = ARTIFACTS
        ),
        dirs = expand(f"{PREFIX}/pkgs/{{artifact}}",
            artifact = ARTIFACTS
        ),
        pkgs_changes = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
            artifact = ARTIFACTS,
            date = get_analysis_dates(f"{PREFIX}/pkgs")
        )
    output:
        sources_stats = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
        pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv"
    shell:
        f"""
        python3 analysis/softenv_analysis.py -t sources-stats -i {{input.sources_stats}} -o {{output.sources_stats}}
        python3 analysis/softenv_analysis.py -t pkgs-changes -i {{input.pkgs_changes}} -o {{output.pkgs_changes}}
        {ANALYSIS_WRAPPER} files {ANALYSIS_SCRIPTS_DIR}/softenv_analysis.py sources-stats {{output.sources_stats}} {{input.today_files}}
        {ANALYSIS_WRAPPER} dirs {ANALYSIS_SCRIPTS_DIR}/softenv_analysis.py pkgs-changes {{output.pkgs_changes}} {{input.dirs}}
        """

rule buildstatus_analysis:
    wildcard_constraints:
        date="\d+"
    input:
        expand(f"{PREFIX}/build_status/{{artifact}}/{{date}}.csv",
            artifact = ARTIFACTS,
            date = DATE
        )
        expand(f"{PREFIX}/build_status/{{artifact}}/{{{{date}}}}.csv",
            artifact = ARTIFACTS
        ),
    output:
        f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
    shell:
        f"""
        python3 analysis/buildstatus_analysis.py -i {{input}} -o {{output}}
        {ANALYSIS_WRAPPER} files {ANALYSIS_SCRIPTS_DIR}/buildstatus_analysis.py {{output}} {{input}}
        """

rule artifact_analysis:
    wildcard_constraints:
        date="\d+"
    input:
        expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{date}}.csv",
            artifact = ARTIFACTS,
            date = get_analysis_dates(f"{PREFIX}/artifact_hash")
        )
        today_files = expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{{{date}}}}.csv",
            artifact = ARTIFACTS
        ),
        dirs = expand(f"{PREFIX}/artifact_hash/{{artifact}}",
            artifact = ARTIFACTS
        ),
    output:
        f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
    shell:
        f"""
        python3 analysis/artifact_analysis.py -i {{input}} -o {{output}}
        {ANALYSIS_WRAPPER} dirs {ANALYSIS_SCRIPTS_DIR}/artifact_analysis.py {{output}} {{input.dirs}}
        """

# Analysis aggregate:

rule analysis_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
@@ -169,56 +157,43 @@ rule analysis_aggregate:
            date = DATE
        )

# rule single_aggregate:
#     input:
#         expand(f"{ANALYSIS_DIR}/{{{{cat}}}}/{{date}}.csv",
#             date = get_analysis_dates(f"{ANALYSIS_DIR}/{{wildcards.cat}}")
#             # date = glob_wildcards("{ANALYSIS_DIR}/{cat}/{date}.csv").date
#         )
#     output:
#         f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv"
#     shell:
#         f"cat {{input}} > {{output}}"

rule pkgschgs_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv",
            date = get_analysis_dates(f"{ANALYSIS_DIR}/pkgs_changes")
        )
        dir = f"{ANALYSIS_DIR}/pkgs_changes",
        today_file = f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv"
    output:
        f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv"
    shell:
        f"cat {{input}} > {{output}}"
        f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"

rule srcsstats_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
            date = get_analysis_dates(f"{ANALYSIS_DIR}/sources_stats")
        )
        dir = f"{ANALYSIS_DIR}/sources_stats",
        today_file = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv"
    output:
        f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv"
    shell:
        f"cat {{input}} > {{output}}"
        f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"

rule artifact_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
            date = get_analysis_dates(f"{ANALYSIS_DIR}/artifact")
        )
        dir = f"{ANALYSIS_DIR}/artifact",
        today_file = f"{ANALYSIS_DIR}/artifact/{{date}}.csv"
    output:
        f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
    shell:
        f"cat {{input}} > {{output}}"
        f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"

rule buildstatus_aggregate:
    input:
        expand(f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
            date = get_analysis_dates(f"{ANALYSIS_DIR}/build_status")
        )
        dir = f"{ANALYSIS_DIR}/build_status",
        today_file = f"{ANALYSIS_DIR}/build_status/{{date}}.csv"
    output:
        f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv"
    shell:
        f"cat {{input}} > {{output}}"
        f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"

# Plot:

rule all_plot:
    input:
workflow/scripts/aggregate_wrapper.sh (new executable file, 8 added lines)
@@ -0,0 +1,8 @@
#!/bin/bash

INPUT_DIR=$1

INPUT=$(find $INPUT_DIR/*.csv -maxdepth 1 -type f)
OUTPUT=$2

cat $INPUT > $OUTPUT
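As invoked by the aggregate rules in the Snakefile above; the directory and output path here are only illustrative:

    # concatenate every top-level CSV of the given directory into the aggregate file
    workflow/scripts/aggregate_wrapper.sh <analysis_dir>/pkgs_changes <analysis_dir>/pkgs_changes/aggregated/<date>.csv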
workflow/scripts/analysis_wrapper.sh (new executable file, 42 added lines)
@@ -0,0 +1,42 @@
#!/bin/bash

echo "$@"

MODE=$1 # Either "dirs" or "files", depending on the type of input
shift
SCRIPT=$1
shift
TYPE=""
if [ $1 = "-t" ]
then
    TYPE=$2 # Used if softenv analysis
    shift
else
    OUTPUT=$1
fi
shift
INPUT="$@"

echo $OUTPUT

# Adding option prefix:
if [ $TYPE != "" ]
then
    TYPE="-t $TYPE"
fi

# If inputs are files, then we just use that as input for the script:
INPUT_FILES=$INPUT
# If inputs are directories, we need to explore every single one of them
# to find the input files to pass to the script:
if [ $MODE = "dirs" ]
then
    INPUT_FILES=""
    for dir in $INPUT
    do
        INPUT_FILES="$INPUT_FILES $(find $dir/*.csv -maxdepth 1 -type f)"
    done
fi
echo $INPUT_FILES

python3 $SCRIPT $TYPE -i $INPUT_FILES -o $OUTPUT
@@ -1,29 +1,6 @@
import csv
import os

def get_analysis_dates(directory):
    outputs = []
    if os.path.exists(directory):
        for file in os.listdir(directory):
            if not os.path.isdir(os.path.join(directory, file)):
                outputs.append(os.path.splitext(file)[0])
    today = datetime.datetime.now().strftime("%Y%m%d")
    if today not in outputs:
        outputs.append(today)
    return outputs

# def find_last_blacklist(blacklist_dir_path):
#     last_blacklist = "0.csv"
#     for blacklist in os.listdir(blacklist_dir_path):
#         if not os.path.isdir(blacklist):
#             # We want the latest one, so the one that has the most recent date
#             # as file name:
#             curbl_date = int(os.path.splitext(blacklist)[0])
#             lastbl_date = int(os.path.splitext(last_blacklist)[0])
#             if curbl_date > lastbl_date:
#                 last_blacklist = blacklist
#     return last_blacklist

def get_blacklisted(blacklist_dir_path):
    blacklisted = set()
    if os.path.exists(blacklist_dir_path):