Handle the checkpoint signal: record an exceeded_time entry in the build status file in case of reaching walltime
This commit is contained in:
parent
ef257e6026
commit
944b1bf6f9
5
ecg.py
5
ecg.py
@ -164,6 +164,7 @@ def buildstatus_saver(output, buildstatus_path):
|
|||||||
"""
|
"""
|
||||||
file_exists = os.path.exists(buildstatus_path)
|
file_exists = os.path.exists(buildstatus_path)
|
||||||
buildstatus_file = open(buildstatus_path, "a")
|
buildstatus_file = open(buildstatus_path, "a")
|
||||||
|
artifact_name = os.path.basename(config_path).split(".")[0]
|
||||||
# # Writing header in case file didn't exist:
|
# # Writing header in case file didn't exist:
|
||||||
# if not file_exists:
|
# if not file_exists:
|
||||||
# buildstatus_file.write("yaml_path,timestamp,error")
|
# buildstatus_file.write("yaml_path,timestamp,error")
|
||||||
@ -173,12 +174,12 @@ def buildstatus_saver(output, buildstatus_path):
|
|||||||
unknown_error = False
|
unknown_error = False
|
||||||
now = datetime.datetime.now()
|
now = datetime.datetime.now()
|
||||||
timestamp = str(datetime.datetime.timestamp(now))
|
timestamp = str(datetime.datetime.timestamp(now))
|
||||||
buildstatus_file.write(f"{config_path},{timestamp},{error_cat}\n")
|
buildstatus_file.write(f"{artifact_name},{timestamp},{error_cat}\n")
|
||||||
print(unknown_error)
|
print(unknown_error)
|
||||||
if unknown_error:
|
if unknown_error:
|
||||||
now = datetime.datetime.now()
|
now = datetime.datetime.now()
|
||||||
timestamp = str(datetime.datetime.timestamp(now))
|
timestamp = str(datetime.datetime.timestamp(now))
|
||||||
buildstatus_file.write(f"{config_path},{timestamp},unknown_error\n")
|
buildstatus_file.write(f"{artifact_name},{timestamp},unknown_error\n")
|
||||||
buildstatus_file.close()
|
buildstatus_file.close()
|
||||||
|
|
||||||
def build_image(config, src_dir):
|
def build_image(config, src_dir):
|
||||||
|
@ -50,7 +50,7 @@ rule check_artifact:
|
|||||||
|
|
||||||
SHELLS_ECG = {
|
SHELLS_ECG = {
|
||||||
"local": f"python3 {{input.ecg}} -l {{output.log}} -p {{output.pkg}} -b {{output.build_status}} -a {{output.artifact_hash}} {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION}",
|
"local": f"python3 {{input.ecg}} -l {{output.log}} -p {{output.pkg}} -b {{output.build_status}} -a {{output.artifact_hash}} {ARTIFACTS_FOLDER_JSON}/{{wildcards.artifact}}.{EXTENSION}",
|
||||||
"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} -- '"
|
"g5k": f"python3 {{input.execo_wrapper}} --path {os.getcwd()} --script {{input.oar_wrapper}} --site {config['site']} --cluster {config['cluster']} --max-duration {config['max_duration']} --checkpoint {config['checkpoint']} {'--besteffort' if config['besteffort'] else ''} --sleep_time {config['sleep_time']} --build_status_file {{output.build_status}} --artifact {{wildcards.artifact}} -- '"
|
||||||
}
|
}
|
||||||
|
|
||||||
rule run_ecg:
|
rule run_ecg:
|
||||||
|
@ -2,6 +2,13 @@
|
|||||||
|
|
||||||
set -xe
|
set -xe
|
||||||
|
|
||||||
|
DIRECTORY=$1
|
||||||
|
shift
|
||||||
|
BUILD_STATUS_FILE=$1
|
||||||
|
shift
|
||||||
|
ARTIFACT_FILE=$1
|
||||||
|
shift
|
||||||
|
|
||||||
# To "activate" nix on the node
|
# To "activate" nix on the node
|
||||||
export PATH=~/.local/bin:$PATH
|
export PATH=~/.local/bin:$PATH
|
||||||
|
|
||||||
@ -10,12 +17,9 @@ export PATH=~/.local/bin:$PATH
|
|||||||
g5k-setup-docker -t
|
g5k-setup-docker -t
|
||||||
|
|
||||||
handler() {
|
handler() {
|
||||||
echo "Caught checkpoint signal at: `date`"; echo "Terminating."; exit 0;
|
echo "${ARTIFACT_FILE}, `date +%s.%N`, exceeded_time" >> ${BUILD_STATUS_FILE}; exit 0;
|
||||||
}
|
}
|
||||||
trap handler SIGUSR2
|
trap handler SIGUSR2
|
||||||
|
|
||||||
cd $1
|
cd ${DIRECTORY}
|
||||||
|
|
||||||
shift
|
|
||||||
|
|
||||||
nix develop --command $@
|
nix develop --command $@
|
||||||
|
@ -2,7 +2,7 @@ from execo_g5k import oardel, oarsub, OarSubmission, wait_oar_job_start, get_oar
|
|||||||
import time
|
import time
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
def submit_job(cluster, site, maximum_duration_minutes, checkpoint_minutes, is_besteffort, path, script, command):
|
def submit_job(cluster, site, maximum_duration_minutes, checkpoint_minutes, is_besteffort, path, script, command, build_status_file, artifact):
|
||||||
reservation_duration = (maximum_duration_minutes + checkpoint_minutes) * 60
|
reservation_duration = (maximum_duration_minutes + checkpoint_minutes) * 60
|
||||||
checkpoint = checkpoint_minutes * 60
|
checkpoint = checkpoint_minutes * 60
|
||||||
job_type = []
|
job_type = []
|
||||||
@ -13,7 +13,7 @@ def submit_job(cluster, site, maximum_duration_minutes, checkpoint_minutes, is_b
|
|||||||
reservation_duration,\
|
reservation_duration,\
|
||||||
job_type=job_type,\
|
job_type=job_type,\
|
||||||
additional_options=f"--checkpoint {checkpoint}",\
|
additional_options=f"--checkpoint {checkpoint}",\
|
||||||
command=f"{path}/{script} {path} {command}"), site)])[0]
|
command=f"{path}/{script} {path} {build_status_file} {artifact} {command}"), site)])[0]
|
||||||
return oar_job_id
|
return oar_job_id
|
||||||
|
|
||||||
def wait_for_completion(oar_job_id, site, sleep_time):
|
def wait_for_completion(oar_job_id, site, sleep_time):
|
||||||
@ -33,11 +33,13 @@ def main():
|
|||||||
parser.add_argument("--path", required=True, help="Root of the project")
|
parser.add_argument("--path", required=True, help="Root of the project")
|
||||||
parser.add_argument("--script", required=True, help="Path of the bash script to oarsub relative to the '--path'")
|
parser.add_argument("--script", required=True, help="Path of the bash script to oarsub relative to the '--path'")
|
||||||
parser.add_argument("--sleep_time", required=False, type=int, default=60, help="Time interval in seconds to check the termination of the job")
|
parser.add_argument("--sleep_time", required=False, type=int, default=60, help="Time interval in seconds to check the termination of the job")
|
||||||
|
parser.add_argument("--build_status_file", required=True, help="File to write the build status to in the case of time exceeding")
|
||||||
|
parser.add_argument("--artifact", required=True, help="Name of the artifact")
|
||||||
parser.add_argument("command", help="ECG Command")
|
parser.add_argument("command", help="ECG Command")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
oar_job_id = submit_job(args.cluster, args.site, args.max_duration, args.checkpoint, args.besteffort, args.path, args.script, args.command)
|
oar_job_id = submit_job(args.cluster, args.site, args.max_duration, args.checkpoint, args.besteffort, args.path, args.script, args.command, args.build_status_file, args.artifact)
|
||||||
|
|
||||||
wait_oar_job_start(oar_job_id, args.site)
|
wait_oar_job_start(oar_job_id, args.site)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user