The results of the get_analysis_dates function were used as inputs in some Snakemake rules, which caused Snakemake to overwrite older analyses, probably because the files used to generate them had been modified. To avoid overwriting older analyses, we now only specify today's analysis in the input and use a bash wrapper script that both fetches the older analyses by itself (it is given the folder to look in) and runs the analysis with the fetched files. So I removed get_analysis_dates and replaced it with the wrapper script everywhere it was used.

I also removed the older unfinished analysis wrapper that I had forgotten existed...
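
To illustrate, a rule now calls the wrapper with the directories to scan instead of listing every dated CSV as an input; a minimal sketch, where the prefix, artifact names and date are placeholders for the config-derived values:

    workflow/scripts/analysis_wrapper.sh dirs analysis/artifact_analysis.py \
        <analysis_dir>/artifact/20240823.csv \
        <prefix>/artifact_hash/artifact1 <prefix>/artifact_hash/artifact2

In "dirs" mode the wrapper collects the CSV files inside the given directories at run time, so the older dated files no longer appear among the rule's Snakemake inputs.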
antux18 2024-08-23 16:14:46 +02:00
parent 514e186c3d
commit 97447e59a1
5 changed files with 92 additions and 108 deletions


@ -1,18 +0,0 @@
#!/bin/bash
ANALYSIS_TYPE=$1
OUTPUT=$2
shift; shift
ARGS=$@
INPUT=("${ARGS[@]/#/-i }")
SCRIPT=""
OPT=""
case ANALYSIS_TYPE in
"softenv")
SCRIPT="softenv_analysis.py"
OPT="-t sources-stats"
;;
python3 softenv_analysis.py -t sources-stats $INPUT -o


@ -21,6 +21,9 @@ SHELLS_ECG = {
ANALYSIS_DIR = config["analysis_dir"]
ANALYSIS_CATS = ["sources_stats", "pkgs_changes", "build_status", "artifact"]
ANALYSIS_SCRIPTS_DIR = "analysis"
ANALYSIS_WRAPPER = "workflow/scripts/analysis_wrapper.sh"
AGGREGATE_WRAPPER = "workflow/scripts/aggregate_wrapper.sh"
PLOT_DIR = config["plot_dir"]
PLOT_SCRIPT = "plot/plot.r"
@ -42,21 +45,10 @@ rule all:
analysis_cat = ["sources_stats", "build_status", "artifact"],
date = DATE
),
# expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
# cat = ANALYSIS_CATS,
# date = DATE
# ),
# expand(f"{PREFIX}/{{folder}}/{{artifact}}/{{date}}.csv",
# folder=["pkgs", "build_status", "artifact_hash"],
# artifact=ARTIFACTS,
# date=DATE
# ),
# expand(f"{PREFIX}/logs/{{artifact}}/{{date}}.txt",
# artifact=ARTIFACTS,
# date=DATE
# ),
f"{BLACKLIST_FOLDER}/{DATE}.csv"
# Artifacts configuration files:
rule check_all:
input:
expand(f"{ARTIFACTS_FOLDER_JSON}/{{artifact}}.json", artifact=ARTIFACTS)
@ -74,6 +66,8 @@ rule check_artifact:
nickel export --format json --output {output} <<< 'let {{Artifact, ..}} = import "{input.contract}" in ((import "{input.artifact}") | Artifact)'
"""
# ECG:
rule run_ecg:
input:
"flake.nix",
@ -102,66 +96,60 @@ rule update_blacklist:
# We need to ignore lines where build is successful:
f"cat {{input}} | grep -v ',success' > {{output}} || true"
# rule analysis:
# input:
# expand(f"{PREFIX}/{{output_dir}}/{{artifact}}/{{date}}.csv",
# output_dir = ECG_OUTPUTS,
# artifact = ARTIFACTS,
# # date = get_analysis_dates("{PREFIX}/{output_dir}")
# date = glob_wildcards("{PREFIX}/{output_dir}/{artifact}/{date}.csv").date
# )
# Analysis:
rule softenv_analysis:
wildcard_constraints:
date="\d+"
input:
sources_stats = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
artifact = ARTIFACTS,
date = DATE
today_files = expand(f"{PREFIX}/pkgs/{{artifact}}/{{{{date}}}}.csv",
artifact = ARTIFACTS
),
dirs = expand(f"{PREFIX}/pkgs/{{artifact}}",
artifact = ARTIFACTS
),
pkgs_changes = expand(f"{PREFIX}/pkgs/{{artifact}}/{{date}}.csv",
artifact = ARTIFACTS,
date = get_analysis_dates(f"{PREFIX}/pkgs")
)
output:
sources_stats = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
pkgs_changes = f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv"
shell:
f"""
python3 analysis/softenv_analysis.py -t sources-stats -i {{input.sources_stats}} -o {{output.sources_stats}}
python3 analysis/softenv_analysis.py -t pkgs-changes -i {{input.pkgs_changes}} -o {{output.pkgs_changes}}
{ANALYSIS_WRAPPER} files {ANALYSIS_SCRIPTS_DIR}/softenv_analysis.py -t sources-stats {{output.sources_stats}} {{input.today_files}}
{ANALYSIS_WRAPPER} dirs {ANALYSIS_SCRIPTS_DIR}/softenv_analysis.py -t pkgs-changes {{output.pkgs_changes}} {{input.dirs}}
"""
rule buildstatus_analysis:
wildcard_constraints:
date="\d+"
input:
expand(f"{PREFIX}/build_status/{{artifact}}/{{date}}.csv",
artifact = ARTIFACTS,
date = DATE
)
expand(f"{PREFIX}/build_status/{{artifact}}/{{{{date}}}}.csv",
artifact = ARTIFACTS
),
output:
f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
shell:
f"""
python3 analysis/buildstatus_analysis.py -i {{input}} -o {{output}}
{ANALYSIS_WRAPPER} files {ANALYSIS_SCRIPTS_DIR}/buildstatus_analysis.py {{output}} {{input}}
"""
rule artifact_analysis:
wildcard_constraints:
date="\d+"
input:
expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{date}}.csv",
artifact = ARTIFACTS,
date = get_analysis_dates(f"{PREFIX}/artifact_hash")
)
today_files = expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{{{date}}}}.csv",
artifact = ARTIFACTS
),
dirs = expand(f"{PREFIX}/artifact_hash/{{artifact}}",
artifact = ARTIFACTS
),
output:
f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
shell:
f"""
python3 analysis/artifact_analysis.py -i {{input}} -o {{output}}
{ANALYSIS_WRAPPER} dirs {ANALYSIS_SCRIPTS_DIR}/artifact_analysis.py {{output}} {{input.dirs}}
"""
# Analysis aggregate:
rule analysis_aggregate:
input:
expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
@ -169,56 +157,43 @@ rule analysis_aggregate:
date = DATE
)
# rule single_aggregate:
# input:
# expand(f"{ANALYSIS_DIR}/{{{{cat}}}}/{{date}}.csv",
# date = get_analysis_dates(f"{ANALYSIS_DIR}/{{wildcards.cat}}")
# # date = glob_wildcards("{ANALYSIS_DIR}/{cat}/{date}.csv").date
# )
# output:
# f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv"
# shell:
# f"cat {{input}} > {{output}}"
rule pkgschgs_aggregate:
input:
expand(f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/pkgs_changes")
)
dir = f"{ANALYSIS_DIR}/pkgs_changes",
today_file = f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv"
output:
f"{ANALYSIS_DIR}/pkgs_changes/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"
rule srcsstats_aggregate:
input:
expand(f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/sources_stats")
)
dir = f"{ANALYSIS_DIR}/sources_stats",
today_file = f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv"
output:
f"{ANALYSIS_DIR}/sources_stats/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"
rule artifact_aggregate:
input:
expand(f"{ANALYSIS_DIR}/artifact/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/artifact")
)
dir = f"{ANALYSIS_DIR}/artifact",
today_file = f"{ANALYSIS_DIR}/artifact/{{date}}.csv"
output:
f"{ANALYSIS_DIR}/artifact/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"
rule buildstatus_aggregate:
input:
expand(f"{ANALYSIS_DIR}/build_status/{{date}}.csv",
date = get_analysis_dates(f"{ANALYSIS_DIR}/build_status")
)
dir = f"{ANALYSIS_DIR}/build_status",
today_file = f"{ANALYSIS_DIR}/build_status/{{date}}.csv"
output:
f"{ANALYSIS_DIR}/build_status/aggregated/{{date}}.csv"
shell:
f"cat {{input}} > {{output}}"
f"{AGGREGATE_WRAPPER} {{input.dir}} {{output}}"
# Plot:
rule all_plot:
input:

workflow/scripts/aggregate_wrapper.sh

@ -0,0 +1,8 @@
#!/bin/bash
# Aggregate every CSV file found in the given directory into the given output file:
INPUT_DIR=$1
INPUT=$(find $INPUT_DIR/*.csv -maxdepth 1 -type f)
OUTPUT=$2
cat $INPUT > $OUTPUT
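
For instance, the build_status aggregation rule in the Snakefile calls it as follows (the analysis directory and date are placeholders for the config-derived values):

    workflow/scripts/aggregate_wrapper.sh <analysis_dir>/build_status <analysis_dir>/build_status/aggregated/20240823.csv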

workflow/scripts/analysis_wrapper.sh

@ -0,0 +1,42 @@
#!/bin/bash
# Gather the analysis input files (given directly, or found inside the given
# directories) and run the given analysis script on them.
echo "$@"
MODE=$1 # Either "dirs" or "files", depending on the type of input
shift
SCRIPT=$1
shift
TYPE=""
# Optional "-t <type>" pair (only used by the softenv analysis), followed by
# the output file:
if [ "$1" = "-t" ]
then
TYPE=$2
shift; shift
fi
OUTPUT=$1
shift
INPUT="$@"
echo $OUTPUT
# Adding option prefix:
if [ "$TYPE" != "" ]
then
TYPE="-t $TYPE"
fi
# If inputs are files, then we just use that as input for the script:
INPUT_FILES=$INPUT
# If inputs are directories, we need to explore every single one of them
# to find the input files to pass to the script:
if [ $MODE = "dirs" ]
then
INPUT_FILES=""
for dir in $INPUT
do
INPUT_FILES="$INPUT_FILES $(find $dir/*.csv -maxdepth 1 -type f)"
done
fi
echo $INPUT_FILES
python3 $SCRIPT $TYPE -i $INPUT_FILES -o $OUTPUT
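
Usage corresponding to the calls made from the Snakefile (paths and artifact names are placeholders for the config-derived values):

    # "files" mode: the listed CSV files are passed to the analysis script as-is.
    workflow/scripts/analysis_wrapper.sh files analysis/buildstatus_analysis.py \
        <analysis_dir>/build_status/20240823.csv <prefix>/build_status/artifact1/20240823.csv
    # "dirs" mode with an analysis type: every CSV found in the listed directories is passed to the script.
    workflow/scripts/analysis_wrapper.sh dirs analysis/softenv_analysis.py -t pkgs-changes \
        <analysis_dir>/pkgs_changes/20240823.csv <prefix>/pkgs/artifact1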


@ -1,29 +1,6 @@
import csv
import os
def get_analysis_dates(directory):
outputs = []
if os.path.exists(directory):
for file in os.listdir(directory):
if not os.path.isdir(os.path.join(directory, file)):
outputs.append(os.path.splitext(file)[0])
today = datetime.datetime.now().strftime("%Y%m%d")
if today not in outputs:
outputs.append(today)
return outputs
# def find_last_blacklist(blacklist_dir_path):
# last_blacklist = "0.csv"
# for blacklist in os.listdir(blacklist_dir_path):
# if not os.path.isdir(blacklist):
# # We want the latest one, so the one that has the most recent date
# # as file name:
# curbl_date = int(os.path.splitext(blacklist)[0])
# lastbl_date = int(os.path.splitext(last_blacklist)[0])
# if curbl_date > lastbl_date:
# last_blacklist = blacklist
# return last_blacklist
def get_blacklisted(blacklist_dir_path):
blacklisted = set()
if os.path.exists(blacklist_dir_path):