From 97447e59a103d95e2d567e070f1731a6e4d61b60 Mon Sep 17 00:00:00 2001
From: antux18
Date: Fri, 23 Aug 2024 16:14:46 +0200
Subject: [PATCH] Replace get_analysis_dates with wrapper scripts so that
 older analyses are no longer overwritten

The results of the get_analysis_dates function were used as inputs to some
Snakemake rules, which caused Snakemake to overwrite older analyses,
probably because the files used to generate those analyses had been
modified since they were produced. To avoid this, each rule now only
declares today's analysis in its input, and a bash wrapper script both
fetches the older analyses itself (given the folder to look in) and runs
the analysis on the fetched files. I therefore removed get_analysis_dates
and replaced it with the wrapper script everywhere it was used. I also
removed the older, unfinished analysis wrapper that I had forgotten about.

For illustration, the wrappers are invoked as shown below. The artifact
names, dates and the outputs/ prefix are made up (the real paths come from
the workflow configuration); the script names and flags are the ones used
in this patch:
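    # Software environment analysis over today's package files only:
    workflow/scripts/analysis_wrapper.sh files analysis/softenv_analysis.py \
        -t sources-stats outputs/analysis/sources_stats/20240823.csv \
        outputs/pkgs/foo/20240823.csv outputs/pkgs/bar/20240823.csv

    # Artifact analysis over every CSV found in the given directories:
    workflow/scripts/analysis_wrapper.sh dirs analysis/artifact_analysis.py \
        outputs/analysis/artifact/20240823.csv \
        outputs/artifact_hash/foo outputs/artifact_hash/bar

    # Aggregation: concatenate every per-date CSV of a directory into one file:
    workflow/scripts/aggregate_wrapper.sh outputs/analysis/build_status \
        outputs/analysis/build_status/aggregated/20240823.csv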
---
 analysis/analysis_wrapper.sh          |  18 -----
 workflow/Snakefile                    | 109 ++++++++++----------------
 workflow/scripts/aggregate_wrapper.sh |   8 ++
 workflow/scripts/analysis_wrapper.sh  |  36 ++++++++++
 workflow/utils.smk                    |  23 ------
 5 files changed, 86 insertions(+), 108 deletions(-)
 delete mode 100755 analysis/analysis_wrapper.sh
 create mode 100755 workflow/scripts/aggregate_wrapper.sh
 create mode 100755 workflow/scripts/analysis_wrapper.sh

diff --git a/analysis/analysis_wrapper.sh b/analysis/analysis_wrapper.sh
deleted file mode 100755
index 005d56c..0000000
--- a/analysis/analysis_wrapper.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-ANALYSIS_TYPE=$1
-OUTPUT=$2
-shift; shift
-
-ARGS=$@
-INPUT=("${ARGS[@]/#/-i }")
-SCRIPT=""
-OPT=""
-
-case ANALYSIS_TYPE in
-    "softenv")
-        SCRIPT="softenv_analysis.py"
-        OPT="-t sources-stats"
-        ;;
-
-python3 softenv_analysis.py -t sources-stats $INPUT -o
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 12b2da5..e9dbab3 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -21,6 +21,9 @@ SHELLS_ECG = {
 
 ANALYSIS_DIR = config["analysis_dir"]
 ANALYSIS_CATS = ["sources_stats", "pkgs_changes", "build_status", "artifact"]
+ANALYSIS_SCRIPTS_DIR = "analysis"
+ANALYSIS_WRAPPER = "workflow/scripts/analysis_wrapper.sh"
+AGGREGATE_WRAPPER = "workflow/scripts/aggregate_wrapper.sh"
 
 PLOT_DIR = config["plot_dir"]
 PLOT_SCRIPT = "plot/plot.r"
@@ -42,21 +45,10 @@ rule all:
     input:
         expand(f"{ANALYSIS_DIR}/{{analysis_cat}}/{{date}}.csv",
             analysis_cat = ["sources_stats", "build_status", "artifact"],
             date = DATE
         ),
-        # expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
-        #     cat = ANALYSIS_CATS,
-        #     date = DATE
-        # ),
-        # expand(f"{PREFIX}/{{folder}}/{{artifact}}/{{date}}.csv",
-        #     folder=["pkgs", "build_status", "artifact_hash"],
-        #     artifact=ARTIFACTS,
-        #     date=DATE
-        # ),
-        # expand(f"{PREFIX}/logs/{{artifact}}/{{date}}.txt",
-        #     artifact=ARTIFACTS,
-        #     date=DATE
-        # ),
         f"{BLACKLIST_FOLDER}/{DATE}.csv"
 
+# Artifacts configuration files:
+
 rule check_all:
     input:
         expand(f"{ARTIFACTS_FOLDER_JSON}/{{artifact}}.json", artifact=ARTIFACTS)
@@ -74,6 +66,8 @@ rule check_artifact:
         nickel export --format json --output {output} <<< 'let {{Artifact, ..}} = import "{input.contract}" in ((import "{input.artifact}") | Artifact)'
         """
 
+# ECG:
+
 rule run_ecg:
     input:
         "flake.nix",
@@ -102,66 +96,60 @@ rule update_blacklist:
         # We need to ignore lines where build is successful:
         f"cat {{input}} | grep -v ',success' > {{output}} || true"
 
-# rule analysis:
-#     input:
-#         expand(f"{PREFIX}/{{output_dir}}/{{artifact}}/{{date}}.csv",
-#             output_dir = ECG_OUTPUTS,
-#             artifact = ARTIFACTS,
-#             # date = get_analysis_dates("{PREFIX}/{output_dir}")
-#             date = glob_wildcards("{PREFIX}/{output_dir}/{artifact}/{date}.csv").date
-#         )
+# Analysis:
 
 rule softenv_analysis:
     wildcard_constraints:
         date="\d+"
     input:
-        sources_stats = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
-            artifact = ARTIFACTS,
-            date = DATE
+        today_files = expand(f"{PREFIX}/pkgs/{{artifact}}/{{{{date}}}}.csv",
+            artifact = ARTIFACTS
+        ),
+        dirs = expand(f"{PREFIX}/pkgs/{{artifact}}",
+            artifact = ARTIFACTS
         ),
-        pkgs_changes = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
-            artifact = ARTIFACTS,
-            date = get_analysis_dates(f"{PREFIX}/pkgs")
-        )
     output:
         sources_stats = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
        pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv"
     shell:
         f"""
-        python3 analysis/softenv_analysis.py -t sources-stats -i {{input.sources_stats}} -o {{output.sources_stats}}
-        python3 analysis/softenv_analysis.py -t pkgs-changes -i {{input.pkgs_changes}} -o {{output.pkgs_changes}}
+        {ANALYSIS_WRAPPER} files {ANALYSIS_SCRIPTS_DIR}/softenv_analysis.py -t sources-stats {{output.sources_stats}} {{input.today_files}}
+        {ANALYSIS_WRAPPER} dirs {ANALYSIS_SCRIPTS_DIR}/softenv_analysis.py -t pkgs-changes {{output.pkgs_changes}} {{input.dirs}}
         """
 
 rule buildstatus_analysis:
     wildcard_constraints:
         date="\d+"
     input:
-        expand(f"{PREFIX}/build_status/{{artifact}}/{{date}}.csv",
-            artifact = ARTIFACTS,
-            date = DATE
-        )
+        expand(f"{PREFIX}/build_status/{{artifact}}/{{{{date}}}}.csv",
+            artifact = ARTIFACTS
+        ),
     output:
         f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
     shell:
         f"""
-        python3 analysis/buildstatus_analysis.py -i {{input}} -o {{output}}
+        {ANALYSIS_WRAPPER} files {ANALYSIS_SCRIPTS_DIR}/buildstatus_analysis.py {{output}} {{input}}
         """
 
 rule artifact_analysis:
     wildcard_constraints:
         date="\d+"
     input:
-        expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{date}}.csv",
-            artifact = ARTIFACTS,
-            date = get_analysis_dates(f"{PREFIX}/artifact_hash")
-        )
+        today_files = expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{{{date}}}}.csv",
+            artifact = ARTIFACTS
+        ),
+        dirs = expand(f"{PREFIX}/artifact_hash/{{artifact}}",
+            artifact = ARTIFACTS
+        ),
     output:
         f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
     shell:
         f"""
-        python3 analysis/artifact_analysis.py -i {{input}} -o {{output}}
+        {ANALYSIS_WRAPPER} dirs {ANALYSIS_SCRIPTS_DIR}/artifact_analysis.py {{output}} {{input.dirs}}
         """
 
+# Analysis aggregate:
+
 rule analysis_aggregate:
     input:
         expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
@@ -169,56 +157,43 @@ rule analysis_aggregate:
             cat = ANALYSIS_CATS,
             date = DATE
         )
 
-# rule single_aggregate:
-#     input:
-#         expand(f"{ANALYSIS_DIR}/{{{{cat}}}}/{{date}}.csv",
-#             date = get_analysis_dates(f"{ANALYSIS_DIR}/{{wildcards.cat}}")
-#             # date = glob_wildcards("{ANALYSIS_DIR}/{cat}/{date}.csv").date
-#         )
-#     output:
-#         f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv"
-#     shell:
-#         f"cat {{input}} > {{output}}"
-
 rule pkgschgs_aggregate:
     input:
-        expand(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv",
-            date = get_analysis_dates(f"{ANALYSIS_DIR}/pkgs_changes")
-        )
+        dir = f"{ANALYSIS_DIR}/pkgs_changes",
+        today_file = f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv"
     output:
         f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv"
     shell:
-        f"cat {{input}} > {{output}}"
+        f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"
 
 rule srcsstats_aggregate:
     input:
-        expand(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
-            date = get_analysis_dates(f"{ANALYSIS_DIR}/sources_stats")
-        )
+        dir = f"{ANALYSIS_DIR}/sources_stats",
+        today_file = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv"
     output:
         f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv"
     shell:
-        f"cat {{input}} > {{output}}"
+        f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"
 
 rule artifact_aggregate:
     input:
-        expand(f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
-            date = get_analysis_dates(f"{ANALYSIS_DIR}/artifact")
-        )
+        dir = f"{ANALYSIS_DIR}/artifact",
+        today_file = f"{ANALYSIS_DIR}/artifact/{{date}}.csv"
     output:
         f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
     shell:
-        f"cat {{input}} > {{output}}"
+        f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"
 
 rule buildstatus_aggregate:
     input:
-        expand(f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
-            date = get_analysis_dates(f"{ANALYSIS_DIR}/build_status")
-        )
+        dir = f"{ANALYSIS_DIR}/build_status",
+        today_file = f"{ANALYSIS_DIR}/build_status/{{date}}.csv"
     output:
         f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv"
     shell:
-        f"cat {{input}} > {{output}}"
+        f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"
+
+# Plot:
 
 rule all_plot:
     input:
diff --git a/workflow/scripts/aggregate_wrapper.sh b/workflow/scripts/aggregate_wrapper.sh
new file mode 100755
index 0000000..fb69d97
--- /dev/null
+++ b/workflow/scripts/aggregate_wrapper.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+INPUT_DIR=$1
+OUTPUT=$2
+
+# Concatenate every CSV file at the root of the input directory:
+INPUT=$(find "$INPUT_DIR" -maxdepth 1 -type f -name '*.csv')
+cat $INPUT > "$OUTPUT"
\ No newline at end of file
diff --git a/workflow/scripts/analysis_wrapper.sh b/workflow/scripts/analysis_wrapper.sh
new file mode 100755
index 0000000..59a5407
--- /dev/null
+++ b/workflow/scripts/analysis_wrapper.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Usage: analysis_wrapper.sh <files|dirs> <script> [-t <type>] <output> <inputs...>
+
+MODE=$1    # Either "dirs" or "files", depending on the type of input
+SCRIPT=$2  # Analysis script to run
+shift 2
+
+# Optional analysis type (only used by the softenv analysis),
+# forwarded to the script as "-t <type>":
+TYPE=""
+if [ "$1" = "-t" ]
+then
+    TYPE="-t $2"
+    shift 2
+fi
+
+OUTPUT=$1
+shift
+INPUT=("$@")
+
+# If the inputs are files, we pass them to the script as they are:
+INPUT_FILES=("${INPUT[@]}")
+# If the inputs are directories, we explore every single one of them
+# to find the input files to pass to the script:
+if [ "$MODE" = "dirs" ]
+then
+    INPUT_FILES=()
+    for dir in "${INPUT[@]}"
+    do
+        INPUT_FILES+=($(find "$dir" -maxdepth 1 -type f -name '*.csv'))
+    done
+fi
+
+# $TYPE is deliberately unquoted so it expands to two words (or none):
+python3 "$SCRIPT" $TYPE -i "${INPUT_FILES[@]}" -o "$OUTPUT"
\ No newline at end of file
diff --git a/workflow/utils.smk b/workflow/utils.smk
index 9b71ab0..f695f08 100644
--- a/workflow/utils.smk
+++ b/workflow/utils.smk
@@ -1,29 +1,6 @@
 import csv
 import os
 
-def get_analysis_dates(directory):
-    outputs = []
-    if os.path.exists(directory):
-        for file in os.listdir(directory):
-            if not os.path.isdir(os.path.join(directory, file)):
-                outputs.append(os.path.splitext(file)[0])
-    today = datetime.datetime.now().strftime("%Y%m%d")
-    if today not in outputs:
-        outputs.append(today)
-    return outputs
-
-# def find_last_blacklist(blacklist_dir_path):
-#     last_blacklist = "0.csv"
-#     for blacklist in os.listdir(blacklist_dir_path):
-#         if not os.path.isdir(blacklist):
-#             # We want the latest one, so the one that has the most recent date
-#             # as file name:
-#             curbl_date = int(os.path.splitext(blacklist)[0])
-#             lastbl_date = int(os.path.splitext(last_blacklist)[0])
-#             if curbl_date > lastbl_date:
-#                 last_blacklist = blacklist
-#             return last_blacklist
-
 def get_blacklisted(blacklist_dir_path):
     blacklisted = set()
     if os.path.exists(blacklist_dir_path):