diff --git a/ci/cases/C96C48_hybatmDA.yaml b/ci/cases/C96C48_hybatmDA.yaml index 9efce409009ea29f4ab8c7590edf3c9dab73f1ab..ebfda04fff91cb72de331a9f7751fea19f892bd8 100644 --- a/ci/cases/C96C48_hybatmDA.yaml +++ b/ci/cases/C96C48_hybatmDA.yaml @@ -1,4 +1,5 @@ experiment: + type: gfs mode: cycled arguments: diff --git a/ci/cases/C96_atm3DVar.yaml b/ci/cases/C96_atm3DVar.yaml index 1648432e09113cd1272713622da6800a613748f7..ca0e3fda6e2fb8da642a2c38c328412f734e069e 100644 --- a/ci/cases/C96_atm3DVar.yaml +++ b/ci/cases/C96_atm3DVar.yaml @@ -1,4 +1,5 @@ experiment: + type: gfs mode: cycled arguments: diff --git a/ci/platforms/hera.sh b/ci/platforms/hera.sh index 35fe7bca91291341de730394d607e32aa9abfb2c..a999748b1fec9557a12887071c0e73e06703e711 100644 --- a/ci/platforms/hera.sh +++ b/ci/platforms/hera.sh @@ -1,7 +1,6 @@ #!/usr/bin/bash export GFS_CI_ROOT=/scratch1/NCEPDEV/global/Terry.McGuinness/GFS_CI_ROOT -export SLURM_ACCOUNT=fv3-cpu -export SALLOC_ACCOUNT="${SLURM_ACCOUNT}" -export SBATCH_ACCOUNT="${SLURM_ACCOUNT}" -export SLURM_QOS=debug +export SLURM_ACCOUNT=nems export ICSDIR_ROOT="/scratch1/NCEPDEV/global/glopara/data/ICSDIR" +export max_concurrent_cases=2 +export max_concurrent_pr=2 diff --git a/ci/platforms/orion.sh b/ci/platforms/orion.sh index 7d69a3b276ee5373ec363f1b4fb453a6593a0118..329fc5aab9c5db5450ccd68abd10f3247e9bde81 100644 --- a/ci/platforms/orion.sh +++ b/ci/platforms/orion.sh @@ -2,10 +2,6 @@ export GFS_CI_ROOT=/work2/noaa/global/mterry/GFS_CI_ROOT export ICSDIR_ROOT=/work/noaa/global/glopara/data/ICSDIR -export SLURM_ACCOUNT=fv3-cpu -export SALLOC_ACCOUNT=${SLURM_ACCOUNT} -export SBATCH_ACCOUNT=${SLURM_ACCOUNT} -export SLURM_QOS=debug -export SLURM_EXCLUSIVE=user -export OMP_NUM_THREADS=1 -ulimit -s unlimited +export SLURM_ACCOUNT=nems +export max_concurrent_cases=2 +export max_concurrent_pr=2 diff --git a/ci/scripts/check_ci.sh b/ci/scripts/check_ci.sh index aa48e9f89462a04a25573c80c834aef9dc440d50..20df09d851b6ac410d758399a07dcf6b43c8a3a1 100755 --- a/ci/scripts/check_ci.sh +++ b/ci/scripts/check_ci.sh @@ -38,20 +38,29 @@ module use "${HOMEgfs}/modulefiles" module load "module_gwsetup.${MACHINE_ID}" module list set -x -rocotostat=$(which rocotostat) +rocotostat=$(command -v rocotostat) if [[ -z ${rocotostat+x} ]]; then echo "rocotostat not found on system" exit 1 else echo "rocotostat being used from ${rocotostat}" fi +rocotocheck=$(command -v rocotocheck) +if [[ -z ${rocotocheck+x} ]]; then + echo "rocotocheck not found on system" + exit 1 +else + echo "rocotocheck being used from ${rocotocheck}" +fi -pr_list_file="open_pr_list" +pr_list_dbfile="${GFS_CI_ROOT}/open_pr_list.db" -if [[ -s "${GFS_CI_ROOT}/${pr_list_file}" ]]; then - pr_list=$(cat "${GFS_CI_ROOT}/${pr_list_file}") -else - echo "no PRs to process .. exit" +pr_list="" +if [[ -f "${pr_list_dbfile}" ]]; then + pr_list=$("${HOMEgfs}/ci/scripts/pr_list_database.py" --display "${pr_list_dbfile}" | grep -v Failed | grep Running | awk '{print $1}') || true +fi +if [[ -z "${pr_list}" ]]; then + echo "no PRs open and ready to run cases on .. exiting" exit 0 fi @@ -76,14 +85,18 @@ for pr in ${pr_list}; do if [[ "${num_cases}" -eq 0 ]] && [[ -d "${pr_dir}/RUNTESTS" ]]; then "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Passed" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${GFS_CI_ROOT}/PR/${pr}/output_${id}" - sed -i "/${pr}/d" "${GFS_CI_ROOT}/${pr_list_file}" + "${HOMEgfs}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" "${pr_list_dbfile}" # Completely remove the PR and its cloned repo on sucess of all cases rm -Rf "${pr_dir}" continue fi for cases in "${pr_dir}/RUNTESTS/"*; do - pslot=$(basename "${cases}") + pslot=$(basename "${cases}") || true + if [[ -z "${pslot}" ]]; then + echo "No cases found in ${pr_dir}/RUNTESTS .. exiting" + exit 0 + fi xml="${pr_dir}/RUNTESTS/${pslot}/EXPDIR/${pslot}/${pslot}.xml" db="${pr_dir}/RUNTESTS/${pslot}/EXPDIR/${pslot}/${pslot}.db" rocoto_stat_output=$("${rocotostat}" -w "${xml}" -d "${db}" -s | grep -v CYCLE) || true @@ -97,15 +110,25 @@ for pr in ${pr_list}; do echo "Experiment ${pslot} Terminated: *FAILED*" echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true } >> "${GFS_CI_ROOT}/PR/${pr}/output_${id}" + error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed" + { + echo "Error logs:" + echo "${error_logs}" + } >> "${GFS_CI_ROOT}/PR/${pr}/output_${id}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${GFS_CI_ROOT}/PR/${pr}/output_${id}" - sed -i "/${pr}/d" "${GFS_CI_ROOT}/${pr_list_file}" + "${HOMEgfs}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" "${pr_list_dbfile}" + for kill_cases in "${pr_dir}/RUNTESTS/"*; do + pslot=$(basename "${kill_cases}") + sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "PR\/${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true + done + break fi if [[ "${num_done}" -eq "${num_cycles}" ]]; then { echo "Experiment ${pslot} completed: *SUCCESS*" echo "Experiment ${pslot} Completed at $(date)" || true - echo -n "with ${num_succeeded} successfully completed jobs" || true + echo "with ${num_succeeded} successfully completed jobs" || true } >> "${GFS_CI_ROOT}/PR/${pr}/output_${id}" "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${GFS_CI_ROOT}/PR/${pr}/output_${id}" #Remove Experment cases that completed successfully diff --git a/ci/scripts/create_experiment.py b/ci/scripts/create_experiment.py index ce95714d486b9bf1510769f40a905d2ee1dea5be..4500e91feb5b9fed53b226d84b58dbc334daca22 100755 --- a/ci/scripts/create_experiment.py +++ b/ci/scripts/create_experiment.py @@ -61,7 +61,6 @@ def input_args(): args: Namespace Namespace with the value of the file path to a yaml file from the key yaml -:w """ description = """Single argument as a yaml file containing the @@ -85,9 +84,12 @@ if __name__ == '__main__': HOMEgfs = user_inputs.dir pslot = Path(user_inputs.yaml).stem + type = setup_expt_args.experiment.type mode = setup_expt_args.experiment.mode setup_expt_cmd = Executable(Path.absolute(Path.joinpath(Path(HOMEgfs), 'workflow', 'setup_expt.py'))) + + setup_expt_cmd.add_default_arg(type) setup_expt_cmd.add_default_arg(mode) for conf, value in setup_expt_args.arguments.items(): @@ -98,11 +100,11 @@ if __name__ == '__main__': setup_expt_cmd.add_default_arg(pslot) logger.info(f'Run command: {setup_expt_cmd.command}') - setup_expt_cmd(output='stdout_expt', error='stderr_expt') + setup_expt_cmd(output='setup_expt.stdout', error='setup_expt.stderr') setup_xml_cmd = Executable(Path.absolute(Path.joinpath(Path(HOMEgfs), 'workflow', 'setup_xml.py'))) expdir = Path.absolute(Path.joinpath(Path(setup_expt_args.arguments.expdir), Path(pslot))) setup_xml_cmd.add_default_arg(str(expdir)) logger.info(f'Run command: {setup_xml_cmd.command}') - setup_xml_cmd(output='stdout_setupxml', error='stderr_setupxml') + setup_xml_cmd(output='setupxml.stdout', error='setupxml.stderr') diff --git a/ci/scripts/driver.sh b/ci/scripts/driver.sh index 0bd90db36c4221549f905a45cfae2b7ea15c504c..6bd76ca2bcc2e2970230dc32cc0ba2d4002d5ae1 100755 --- a/ci/scripts/driver.sh +++ b/ci/scripts/driver.sh @@ -57,17 +57,26 @@ set -x ############################################################ # query repo and get list of open PRs with tags {machine}-CI ############################################################ -pr_list_file="open_pr_list" -touch "${GFS_CI_ROOT}/${pr_list_file}" -list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" --state "open") -list=$(echo "${list}" | awk '{print $1;}' >> "${GFS_CI_ROOT}/${pr_list_file}") - -if [[ -s "${GFS_CI_ROOT}/${pr_list_file}" ]]; then - pr_list=$(cat "${GFS_CI_ROOT}/${pr_list_file}") -else - echo "no PRs to process .. exit" - exit 0 -fi +pr_list_dbfile="${GFS_CI_ROOT}/open_pr_list.db" +if [[ ! -f "${pr_list_dbfile}" ]]; then + "${HOMEgfs}/ci/scripts/pr_list_database.py" --create "${pr_list_dbfile}" +fi + +pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" --state "open" | awk '{print $1}') || true + +for pr in ${pr_list}; do + "${HOMEgfs}/ci/scripts/pr_list_database.py" --add_pr "${pr}" "${pr_list_dbfile}" +done + +pr_list="" +if [[ -f "${pr_list_dbfile}" ]]; then + pr_list=$("${HOMEgfs}/ci/scripts/pr_list_database.py" --display "${pr_list_dbfile}" | grep -v Failed | grep Open | grep Ready | awk '{print $1}') || true +fi +if [[ -z "${pr_list}" ]]; then + echo "no PRs open and ready for checkout/build .. exiting" + exit 0 +fi + ############################################################# # Loop throu all open PRs @@ -87,10 +96,11 @@ for pr in ${pr_list}; do ci_status=$? set -e if [[ ${ci_status} -eq 0 ]]; then + "${HOMEgfs}/ci/scripts/pr_list_database.py" --update_pr "${pr}" Open Built "${pr_list_dbfile}" #setup space to put an experiment # export RUNTESTS for yaml case files to pickup export RUNTESTS="${pr_dir}/RUNTESTS" - rm -Rf "${pr_dir:?}/RUNTESTS/"* + #rm -Rf "${pr_dir:?}/RUNTESTS/"* ############################################################# # loop over every yaml file in ${HOMEgfs}/ci/cases @@ -99,6 +109,8 @@ for pr in ${pr_list}; do for yaml_config in "${HOMEgfs}/ci/cases/"*.yaml; do pslot=$(basename "${yaml_config}" .yaml) || true export pslot + sed -i "/^base:/a\ ACCOUNT: \${SLURM_ACCOUNT}" "${pr_dir}/global-workflow/parm/config/gfs/yaml/defaults.yaml" + sed -i "/^base:/a\ ACCOUNT: \${SLURM_ACCOUNT}" "${pr_dir}/global-workflow/parm/config/gefs/yaml/defaults.yaml" set +e "${HOMEgfs}/ci/scripts/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/${pslot}.yaml" --dir "${pr_dir}/global-workflow" ci_status=$? @@ -109,12 +121,14 @@ for pr in ${pr_list}; do echo "Case setup: Completed at $(date) for experiment ${pslot}" || true } >> "${GFS_CI_ROOT}/PR/${pr}/output_${id}" "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Running" + "${HOMEgfs}/ci/scripts/pr_list_database.py" --update_pr "${pr}" Open Running "${pr_list_dbfile}" else { echo "Failed to create experiment}: *FAIL* ${pslot}" echo "Experiment setup: failed at $(date) for experiment ${pslot}" || true } >> "${GFS_CI_ROOT}/PR/${pr}/output_${id}" "${GH}" pr edit "${pr}" --repo "${REPO_URL}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Failed" + "${HOMEgfs}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" "${pr_list_dbfile}" fi done @@ -124,6 +138,7 @@ for pr in ${pr_list}; do echo "CI on ${MACHINE_ID^} failed to build on $(date) for repo ${REPO_URL}}" || true } >> "${GFS_CI_ROOT}/PR/${pr}/output_${id}" "${GH}" pr edit "${pr}" --repo "${REPO_URL}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Failed" + "${HOMEgfs}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" "${pr_list_dbfile}" fi "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${GFS_CI_ROOT}/PR/${pr}/output_${id}" diff --git a/ci/scripts/pr_list_database.py b/ci/scripts/pr_list_database.py new file mode 100755 index 0000000000000000000000000000000000000000..b2bc1bc23dc4f1a13af06b5980847c483166336b --- /dev/null +++ b/ci/scripts/pr_list_database.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 + +import sys +from pathlib import Path +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter +import sqlite3 + + +def sql_connection(filename: Path) -> sqlite3.Connection: + """ + Returns an Sqlite3 Cursor object from a given path to a sqlite3 database file + + Parameters + ---------- + filename : Path + Full path to a sqlite3 database file + + Returns + ------- + sqlite3.Connection + Sqlite3 Connection object for updating table + + """ + try: + return sqlite3.connect(Path(filename)) + except sqlite3.Error: + print(sqlite3.Error) + sys.exit(-1) + + +def sql_table(obj: sqlite3.Cursor) -> None: + """ + Creates the initial sqlite3 table for PR states and status + + Parameters + ---------- + obj : sqlite3.Cursor + Cursor object for Sqlite3 + + """ + + obj.execute("CREATE TABLE processing(pr integer PRIMARY KEY, state text, status text)") + + +def sql_insert(obj: sqlite3.Cursor, entities: list) -> None: + """ + Inserts a new row in sqlite3 table with PR, state, and status + + Parameters + ---------- + obj : sqlite3.Cursor + Cursor object for Sqlite3 + entities : list + The list three string values that go into sqlite table (pr, state, status) + + """ + + obj.execute('INSERT INTO processing(pr, state, status) VALUES(?, ?, ?)', entities) + + +def sql_update(obj: sqlite3.Cursor, pr: str, state: str, status: str) -> None: + """Updates table for a given pr with new values for state and status + + Parameters + ---------- + obj : sqlite.sql_connection + sqlite3 Cursor Object + pr : str + The given pr number to update in the table + state : str + The new value for the state (Open, Closed) + status: str + The new value for the status (Ready, Running, Failed) + + """ + + obj.execute(f'UPDATE processing SET state = "{state}", status = "{status}" WHERE pr = {pr}') + + +def sql_fetch(obj: sqlite3.Cursor) -> list: + """ Gets list of all rows in table + + Parameters + ---------- + obj : sqlite.sql_connection + sqlite3 Cursor Object + + """ + + obj.execute('SELECT * FROM processing') + return obj.fetchall() + + +def sql_remove(obj: sqlite3.Cursor, pr: str) -> None: + """ Removes the row from table with given pr number + + Parameters + ---------- + obj : sqlite.sql_connection + sqlite3 Connection Object + pr : str + pr number acting as key for removing the row with in it + + """ + + obj.execute(f'DELETE FROM processing WHERE pr = {pr}').rowcount + + +def input_args(): + + description = """Arguments for creating and updating db file for pr states + """ + + parser = ArgumentParser(description=description, + formatter_class=ArgumentDefaultsHelpFormatter) + + parser.add_argument('sbfile', help='SQLite3 database file with PR list', type=str) + parser.add_argument('--create', help='create sqlite file for pr list status', action='store_true', required=False) + parser.add_argument('--add_pr', nargs=1, metavar='PR', help='add new pr to list (defults to: Open,Ready)', required=False) + parser.add_argument('--remove_pr', nargs=1, metavar='PR', help='removes pr from list', required=False) + parser.add_argument('--update_pr', nargs=3, metavar=('pr', 'state', 'status'), help='updates state and status of a given pr', required=False) + parser.add_argument('--display', help='output pr table', action='store_true', required=False) + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + + args = input_args() + + con = sql_connection(args.sbfile) + obj = con.cursor() + + if args.create: + sql_table(obj) + + if args.add_pr: + rows = sql_fetch(obj) + for row in rows: + if str(row[0]) == str(args.add_pr[0]): + print(f"pr {row[0]} already is in list: nothing added") + sys.exit(0) + + entities = (args.add_pr[0], 'Open', 'Ready') + sql_insert(obj, entities) + + if args.update_pr: + pr = args.update_pr[0] + state = args.update_pr[1] + status = args.update_pr[2] + sql_update(obj, pr, state, status) + + if args.remove_pr: + sql_remove(obj, args.remove_pr[0]) + + if args.display: + rows = sql_fetch(obj) + for row in rows: + print(' '.join(map(str, row))) + + con.commit() + con.close() diff --git a/ci/scripts/run_ci.sh b/ci/scripts/run_ci.sh index c79ea06e77e11cd000109854e28a2dbe1f1fd5d5..8a1a363d32821eed54cbfdbe6f4130d69bc897f7 100755 --- a/ci/scripts/run_ci.sh +++ b/ci/scripts/run_ci.sh @@ -43,24 +43,41 @@ else exit 1 fi -pr_list_file="open_pr_list" +pr_list_dbfile="${GFS_CI_ROOT}/open_pr_list.db" -if [[ -s "${GFS_CI_ROOT}/${pr_list_file}" ]]; then - pr_list=$(cat "${GFS_CI_ROOT}/${pr_list_file}") -else - echo "no PRs to process .. exit" +pr_list="" +if [[ -f "${pr_list_dbfile}" ]]; then + pr_list=$("${HOMEgfs}/ci/scripts/pr_list_database.py" --display "${pr_list_dbfile}" | grep -v Failed | grep Open | grep Running | awk '{print $1}' | head -"${max_concurrent_pr}") || true +fi +if [[ -z "${pr_list}" ]]; then + echo "no PRs open and ready for checkout/build .. exiting" exit 0 fi ############################################################# # Loop throu all PRs in PR List and look for expirments in # the RUNTESTS dir and for each one run runcotorun on them +# only up to $max_concurrent_cases will advance at a time ############################################################# for pr in ${pr_list}; do echo "Processing Pull Request #${pr} and looking for cases" pr_dir="${GFS_CI_ROOT}/PR/${pr}" + # If the directory RUNTESTS is not present then + # setupexpt.py has no been run yet for this PR + if [[ ! -d "${pr_dir}/RUNTESTS" ]]; then + continue + fi + num_cases=0 for cases in "${pr_dir}/RUNTESTS/"*; do + if [[ ! -d "${cases}" ]]; then + continue + fi + ((num_cases=num_cases+1)) + # No more than two cases are going forward at a time for each PR + if [[ "${num_cases}" -gt "${max_concurrent_cases}" ]]; then + continue + fi pslot=$(basename "${cases}") xml="${pr_dir}/RUNTESTS/${pslot}/EXPDIR/${pslot}/${pslot}.xml" db="${pr_dir}/RUNTESTS/${pslot}/EXPDIR/${pslot}/${pslot}.db" @@ -68,4 +85,3 @@ for pr in ${pr_list}; do "${rocotorun}" -v 10 -w "${xml}" -d "${db}" done done - diff --git a/modulefiles/module_gwci.orion.lua b/modulefiles/module_gwci.orion.lua index 779e80a45438b113c07a0390db77bfe77d118c8c..971ba01c655e2d0b9303668a0e2593507dbeb87a 100644 --- a/modulefiles/module_gwci.orion.lua +++ b/modulefiles/module_gwci.orion.lua @@ -8,6 +8,7 @@ load(pathJoin("hpc", "1.1.0")) load(pathJoin("hpc-intel", "2018.4")) load(pathJoin("hpc-impi", "2018.4")) load(pathJoin("netcdf","4.7.4")) +load(pathJoin("netcdf","4.7.4-parallel")) load(pathJoin("nccmp"," 1.8.7.0")) load(pathJoin("contrib","0.1")) load(pathJoin("wgrib2","3.0.2"))