checkpoint in case of reaching walltime

This commit is contained in:
Quentin Guilloteau 2024-07-21 17:32:04 +02:00
parent ef257e6026
commit 944b1bf6f9
4 changed files with 18 additions and 11 deletions

5
ecg.py
View File

@@ -164,6 +164,7 @@ def buildstatus_saver(output, buildstatus_path):
 """
 file_exists = os.path.exists(buildstatus_path)
 buildstatus_file = open(buildstatus_path, "a")
+artifact_name = os.path.basename(config_path).split(".")[0]
 # # Writing header in case file didn't exist:
 # if not file_exists:
 # buildstatus_file.write("yaml_path,timestamp,error")
@@ -173,12 +174,12 @@ def buildstatus_saver(output, buildstatus_path):
 unknown_error = False
 now = datetime.datetime.now()
 timestamp = str(datetime.datetime.timestamp(now))
-buildstatus_file.write(f"{config_path},{timestamp},{error_cat}\n")
+buildstatus_file.write(f"{artifact_name},{timestamp},{error_cat}\n")
 print(unknown_error)
 if unknown_error:
 now = datetime.datetime.now()
 timestamp = str(datetime.datetime.timestamp(now))
-buildstatus_file.write(f"{config_path},{timestamp},unknown_error\n")
+buildstatus_file.write(f"{artifact_name},{timestamp},unknown_error\n")
 buildstatus_file.close()
 def build_image(config, src_dir):

View File

@@ -50,7 +50,7 @@ rule check_artifact:
 SHELLS_ECG = {
 "local": f"python3 {{input.ecg}} -l {{output.log}} -p {{output.pkg}} -b {{output.build_status}} -a {{output.artifact_hash}} {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION}",
-"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} -- '"
+"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} --build_status_file {{output.build_status}} --artifact {{wildcards.artifact}} -- '"
 }
 rule run_ecg:

View File

@@ -2,6 +2,13 @@
 set -xe
+DIRECTORY=$1
+shift
+BUILD_STATUS_FILE=$1
+shift
+ARTIFACT_FILE=$1
+shift
 # To "activate" nix on the node
 export PATH=~/.local/bin:$PATH
@@ -10,12 +17,9 @@ export PATH=~/.local/bin:$PATH
 g5k-setup-docker -t
 handler() {
-echo "Caught checkpoint signal at: `date`"; echo "Terminating."; exit 0;
+echo "${ARTIFACT_FILE}, `date +%s.%N`, exceeded_time" >> ${BUILD_STATUS_FILE}; exit 0;
 }
 trap handler SIGUSR2
-cd $1
-shift
+cd ${DIRECTORY}
 nix develop --command $@

View File

@@ -2,7 +2,7 @@ from execo_g5k import oardel, oarsub, OarSubmission, wait_oar_job_start, get_oar
 import time
 import argparse
-def submit_job(cluster, site, maximum_duration_minutes, checkpoint_minutes, is_besteffort, path, script, command):
+def submit_job(cluster, site, maximum_duration_minutes, checkpoint_minutes, is_besteffort, path, script, command, build_status_file, artifact):
 reservation_duration = (maximum_duration_minutes + checkpoint_minutes) * 60
 checkpoint = checkpoint_minutes * 60
 job_type = []
@@ -13,7 +13,7 @@ def submit_job(cluster, site, maximum_duration_minutes, checkpoint_minutes, is_b
 reservation_duration,\
 job_type=job_type,\
 additional_options=f"--checkpoint {checkpoint}",\
-command=f"{path}/{script} {path} {command}"), site)])[0]
+command=f"{path}/{script} {path} {build_status_file} {artifact} {command}"), site)])[0]
 return oar_job_id
 def wait_for_completion(oar_job_id, site, sleep_time):
@@ -33,11 +33,13 @@ def main():
 parser.add_argument("--path", required=True, help="Root of the project")
 parser.add_argument("--script", required=True, help="Path of the bash script to oarsub relative to the '--path'")
 parser.add_argument("--sleep_time", required=False, type=int, default=60, help="Time interval in seconds to check the termination of the job")
+parser.add_argument("--build_status_file", required=True, help="File to write the build status to in the case of time exceeding")
+parser.add_argument("--artifact", required=True, help="Name of the artifact")
 parser.add_argument("command", help="ECG Command")
 args = parser.parse_args()
-oar_job_id = submit_job(args.cluster, args.site, args.max_duration, args.checkpoint, args.besteffort, args.path, args.script, args.command)
+oar_job_id = submit_job(args.cluster, args.site, args.max_duration, args.checkpoint, args.besteffort, args.path, args.script, args.command, args.build_status_file, args.artifact)
 wait_oar_job_start(oar_job_id, args.site)