# study-docker-repro-longevity/workflow/Snakefile

configfile: "config/config.yaml"
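# The keys this Snakefile reads from config/config.yaml, shown as a minimal
# sketch; all values below are illustrative assumptions, not project defaults:
#
#   folder_artifacts_nickel: "artifacts/nickel"
#   folder_artifacts_json: "artifacts/json"
#   system: "g5k"            # anything other than "g5k" runs ECG locally
#   conference: "example24"  # hypothetical conference name
#   prefix: "output"
#   analysis_dir: "output/analysis"
#   plot_dir: "output/plots"
#   # Grid'5000 submission parameters, used only when system == "g5k":
#   site: "grenoble"
#   cluster: "dahu"
#   max_duration: 60
#   checkpoint: 1
#   besteffort: true
#   sleep_time: 30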
include: "utils.smk"
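# utils.smk is assumed to provide get_artifacts_to_build(), which scans the
# given Nickel artifacts directory and returns the artifact names (without
# the .ncl extension) used as wildcard values below.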
import os
import datetime
DATE = datetime.datetime.now().strftime("%Y%m%d")
ARTIFACTS_FOLDER_NICKEL = config["folder_artifacts_nickel"]
ARTIFACTS_FOLDER_JSON = config["folder_artifacts_json"]
SYSTEM = config["system"]
CONFERENCE = config["conference"]
ARTIFACTS = get_artifacts_to_build(ARTIFACTS_FOLDER_NICKEL + "/" + CONFERENCE)
PREFIX = config["prefix"]
ECG_OUTPUTS = ["pkgs", "build_status", "artifact_hash"]
ANALYSIS_DIR = config["analysis_dir"]
ANALYSIS_CATS = ["sources_stats", "pkgs_changes", "build_status", "artifact"]
ANALYSIS_SCRIPTS_DIR = "analysis"
ANALYSIS_WRAPPER = "workflow/scripts/analysis_wrapper.sh"
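# Space-separated directory lists handed to analysis_wrapper.sh by the
# (currently commented-out) analysis rules below: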
ARTIFACT_ANALYSIS_DIRS = " ".join(expand(f"{PREFIX}/artifact_hash/{{artifact}}",
    artifact=ARTIFACTS
))
SOFTENV_ANALYSIS_DIRS = " ".join(expand(f"{PREFIX}/pkgs/{{artifact}}",
    artifact=ARTIFACTS
))
PLOT_DIR = config["plot_dir"]
# Expected column headers for each analysis category, passed to plot.r:
PLOT_HEADERS = {
    #"softenv": "dpkg rpm pacman pip conda git misc",
    "sources_stats": "dpkg rpm pacman pip conda git misc",
    "pkgs_changes": "dpkg rpm pacman pip conda git misc",
    "build_status": "success package_install_failed baseimage_unavailable artifact_unavailable dockerfile_not_found script_crash job_time_exceeded unknown_error",
    "artifact": "available unavailable changed"
}
rule all:
    input:
        expand(f"{PREFIX}/{{conference}}/build_status/{{artifact}}/{{date}}.csv",
            conference=config["conference"],
            artifact=ARTIFACTS,
            date=DATE
        )
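# As the first rule, this is the default target: a plain `snakemake --cores 1`
# builds today's build_status CSV for every artifact of the configured conference.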
# Artifacts configuration files:
rule check_all:
    input:
        expand(f"{ARTIFACTS_FOLDER_JSON}/{{conference}}/{{artifact}}.json", artifact=ARTIFACTS, conference=config["conference"])
rule check_artifact:
    input:
        "flake.nix",
        "flake.lock",
        contract="workflow/nickel/artifact_contract.ncl",
        artifact=f"{ARTIFACTS_FOLDER_NICKEL}/{{conference}}/{{artifact}}.ncl"
    output:
        f"{ARTIFACTS_FOLDER_JSON}/{{conference}}/{{artifact}}.json"
    shell:
        """
        nix develop .#nickel --command nickel export --format json --output {output} <<< 'let {{Artifact, ..}} = import "{input.contract}" in ((import "{input.artifact}") | Artifact)'
        """
# ECG:
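# run_ecg runs ECG on a single artifact. When SYSTEM == "g5k", the nix command
# below is wrapped in an OAR job submitted through submission_g5k.py; otherwise
# it runs directly on the local machine. If ECG itself crashes, the rule still
# records a "script_crash" entry in the build status file.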
rule run_ecg:
    input:
        "flake.nix",
        "flake.lock",
        ecg="ecg/app/ecg.py",
        execo_wrapper="workflow/scripts/submission_g5k.py",
        oar_wrapper="workflow/scripts/ecg_oar_wrapper.oar.bash",
        artifact=f"{ARTIFACTS_FOLDER_JSON}/{{conference}}/{{artifact}}.json"
    output:
        log=f"{PREFIX}/{{conference}}/logs/{{artifact}}/{{date}}.txt",
        pkg=f"{PREFIX}/{{conference}}/pkgs/{{artifact}}/{{date}}.csv",
        build_status=f"{PREFIX}/{{conference}}/build_status/{{artifact}}/{{date}}.csv",
        artifact_hash=f"{PREFIX}/{{conference}}/artifact_hash/{{artifact}}/{{date}}.csv",
    shell:
        (f"python3 {{input.execo_wrapper}} --path {os.getcwd()} \
            --script {{input.oar_wrapper}} \
            --site {config['site']} \
            --cluster {config['cluster']} \
            --max-duration {config['max_duration']} \
            --checkpoint {config['checkpoint']} \
            {'--besteffort' if config['besteffort'] else ''} \
            --sleep_time {config['sleep_time']} \
            --build_status_file {{output.build_status}} \
            --artifact {{wildcards.artifact}} -- '" if SYSTEM == "g5k" else "") + \
        """
        nix shell .#ecg --command ecg -p {output.pkg} -b {output.build_status} -a {output.artifact_hash} {input.artifact} &> {output.log} || echo "{input.artifact}, `date +%s.%N`, script_crash" > {output.build_status}
        """ + \
        ("'" if SYSTEM == "g5k" else "")
# Analysis:
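# NOTE: the analysis and plotting rules below are commented out; some of their
# paths still use an older layout without the {conference} segment present in
# the active rules above.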
#rule softenv_analysis:
#    wildcard_constraints:
#        date=r"\d+"
#    input:
#        expand(f"{PREFIX}/{{conference}}/pkgs/{{artifact}}/{{{{date}}}}.csv",
#            artifact=ARTIFACTS
#        )
#    output:
#        sources_stats=f"{ANALYSIS_DIR}/sources_stats/{{date}}.csv",
#        pkgs_changes=f"{ANALYSIS_DIR}/pkgs_changes/{{date}}.csv"
#    shell:
#        f"""
#        {ANALYSIS_WRAPPER} files {ANALYSIS_SCRIPTS_DIR}/softenv_analysis.py -t sources-stats {{output.sources_stats}} {{input}}
#        {ANALYSIS_WRAPPER} dirs {ANALYSIS_SCRIPTS_DIR}/softenv_analysis.py -t pkgs-changes {{output.pkgs_changes}} {SOFTENV_ANALYSIS_DIRS}
#        """
#
#rule buildstatus_analysis:
#    wildcard_constraints:
#        date=r"\d+"
#    input:
#        expand(f"{PREFIX}/build_status/{{artifact}}/{{{{date}}}}.csv",
#            artifact=ARTIFACTS
#        ),
#    output:
#        f"{ANALYSIS_DIR}/build_status/{{date}}.csv"
#    shell:
#        f"""
#        {ANALYSIS_WRAPPER} files {ANALYSIS_SCRIPTS_DIR}/buildstatus_analysis.py {{output}} {{input}}
#        """
#
#rule artifact_analysis:
#    wildcard_constraints:
#        date=r"\d+"
#    input:
#        expand(f"{PREFIX}/artifact_hash/{{artifact}}/{{{{date}}}}.csv",
#            artifact=ARTIFACTS
#        )
#    output:
#        f"{ANALYSIS_DIR}/artifact/{{date}}.csv"
#    shell:
#        f"""
#        {ANALYSIS_WRAPPER} dirs {ANALYSIS_SCRIPTS_DIR}/artifact_analysis.py {{output}} {ARTIFACT_ANALYSIS_DIRS}
#        """
#
## Analysis aggregate:
#
#rule analysis_aggregate:
#    input:
#        expand(f"{ANALYSIS_DIR}/{{cat}}/aggregated/{{date}}.csv",
#            cat=ANALYSIS_CATS,
#            date=DATE
#        )
#
#rule aggregate_by_type:
#    input:
#        data=f"{ANALYSIS_DIR}/{{type}}/{{date}}.csv",
#        script="workflow/scripts/aggregate_wrapper.sh"
#    output:
#        f"{ANALYSIS_DIR}/{{type}}/aggregated/{{date}}.csv"
#    shell:
#        f"{{input.script}} {ANALYSIS_DIR}/{{type}} {{output}}"
#
## Plot:
#
#rule plot:
#    input:
#        script="plot/plot.r",
#        data=f"{ANALYSIS_DIR}/{{type}}/aggregated/{{date}}.csv",
#    output:
#        f"{ANALYSIS_DIR}/{{type}}/{{plot}}/{{date}}.pdf"
#    params:
#        header=lambda w: PLOT_HEADERS[w.type]
#    shell:
#        "Rscript {input.script} {wildcards.plot} {input.data} {output} {params.header} timestamp"