checkpoint in case of reaching walltime
This commit is contained in:
parent
ef257e6026
commit
944b1bf6f9
5
ecg.py
5
ecg.py
@ -164,6 +164,7 @@ def buildstatus_saver(output, buildstatus_path):
|
||||
"""
|
||||
file_exists = os.path.exists(buildstatus_path)
|
||||
buildstatus_file = open(buildstatus_path, "a")
|
||||
artifact_name = os.path.basename(config_path).split(".")[0]
|
||||
# # Writing header in case file didn't exist:
|
||||
# if not file_exists:
|
||||
# buildstatus_file.write("yaml_path,timestamp,error")
|
||||
@ -173,12 +174,12 @@ def buildstatus_saver(output, buildstatus_path):
|
||||
unknown_error = False
|
||||
now = datetime.datetime.now()
|
||||
timestamp = str(datetime.datetime.timestamp(now))
|
||||
buildstatus_file.write(f"{config_path},{timestamp},{error_cat}\n")
|
||||
buildstatus_file.write(f"{artifact_name},{timestamp},{error_cat}\n")
|
||||
print(unknown_error)
|
||||
if unknown_error:
|
||||
now = datetime.datetime.now()
|
||||
timestamp = str(datetime.datetime.timestamp(now))
|
||||
buildstatus_file.write(f"{config_path},{timestamp},unknown_error\n")
|
||||
buildstatus_file.write(f"{artifact_name},{timestamp},unknown_error\n")
|
||||
buildstatus_file.close()
|
||||
|
||||
def build_image(config, src_dir):
|
||||
|
@ -50,7 +50,7 @@ rule check_artifact:
|
||||
|
||||
SHELLS_ECG = {
|
||||
"local": f"python3 {{input.ecg}} -l {{output.log}} -p {{output.pkg}} -b {{output.build_status}} -a {{output.artifact_hash}} {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION}",
|
||||
"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} -- '"
|
||||
"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} --build_status_file {{output.build_status}} --artifact {{wildcards.artifact}} -- '"
|
||||
}
|
||||
|
||||
rule run_ecg:
|
||||
|
@ -2,6 +2,13 @@
|
||||
|
||||
set -xe
|
||||
|
||||
DIRECTORY=$1
|
||||
shift
|
||||
BUILD_STATUS_FILE=$1
|
||||
shift
|
||||
ARTIFACT_FILE=$1
|
||||
shift
|
||||
|
||||
# To "activate" nix on the node
|
||||
export PATH=~/.local/bin:$PATH
|
||||
|
||||
@ -10,12 +17,9 @@ export PATH=~/.local/bin:$PATH
|
||||
g5k-setup-docker -t
|
||||
|
||||
handler() {
|
||||
echo "Caught checkpoint signal at: `date`"; echo "Terminating."; exit 0;
|
||||
echo "${ARTIFACT_FILE}, `date +%s.%N`, exceeded_time" >> ${BUILD_STATUS_FILE}; exit 0;
|
||||
}
|
||||
trap handler SIGUSR2
|
||||
|
||||
cd $1
|
||||
|
||||
shift
|
||||
|
||||
cd ${DIRECTORY}
|
||||
nix develop --command $@
|
||||
|
@ -2,7 +2,7 @@ from execo_g5k import oardel, oarsub, OarSubmission, wait_oar_job_start, get_oar
|
||||
import time
|
||||
import argparse
|
||||
|
||||
def submit_job(cluster, site, maximum_duration_minutes, checkpoint_minutes, is_besteffort, path, script, command):
|
||||
def submit_job(cluster, site, maximum_duration_minutes, checkpoint_minutes, is_besteffort, path, script, command, build_status_file, artifact):
|
||||
reservation_duration = (maximum_duration_minutes + checkpoint_minutes) * 60
|
||||
checkpoint = checkpoint_minutes * 60
|
||||
job_type = []
|
||||
@ -13,7 +13,7 @@ def submit_job(cluster, site, maximum_duration_minutes, checkpoint_minutes, is_b
|
||||
reservation_duration,\
|
||||
job_type=job_type,\
|
||||
additional_options=f"--checkpoint {checkpoint}",\
|
||||
command=f"{path}/{script} {path} {command}"), site)])[0]
|
||||
command=f"{path}/{script} {path} {build_status_file} {artifact} {command}"), site)])[0]
|
||||
return oar_job_id
|
||||
|
||||
def wait_for_completion(oar_job_id, site, sleep_time):
|
||||
@ -33,11 +33,13 @@ def main():
|
||||
parser.add_argument("--path", required=True, help="Root of the project")
|
||||
parser.add_argument("--script", required=True, help="Path of the bash script to oarsub relative to the '--path'")
|
||||
parser.add_argument("--sleep_time", required=False, type=int, default=60, help="Time interval in seconds to check the termination of the job")
|
||||
parser.add_argument("--build_status_file", required=True, help="File to write the build status to in the case of time exceeding")
|
||||
parser.add_argument("--artifact", required=True, help="Name of the artifact")
|
||||
parser.add_argument("command", help="ECG Command")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
oar_job_id = submit_job(args.cluster, args.site, args.max_duration, args.checkpoint, args.besteffort, args.path, args.script, args.command)
|
||||
oar_job_id = submit_job(args.cluster, args.site, args.max_duration, args.checkpoint, args.besteffort, args.path, args.script, args.command, args.build_status_file, args.artifact)
|
||||
|
||||
wait_oar_job_start(oar_job_id, args.site)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user