ci: Make LAVA jobs fail CI job when retry is exhausted
When the lava_job_submitter.py retry loop finishes normally (without falling through break-loop) it means that the submitter has exceeded the retry count limit. However, when it happens the script finishes normally. This patch adds a treatment to this case, warning the user what happened and forcing the job to fail. Moreover, this commit will make retry configurations configurable by CI job, as it can take the default value from the following variables: - LAVA_DEVICE_HANGING_TIMEOUT_SEC - LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC - LAVA_LOG_POLLING_TIME_SEC - LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14876>
This commit is contained in:
parent
df0e2a1565
commit
addac10443
|
@ -31,25 +31,27 @@ import time
|
|||
import traceback
|
||||
import urllib.parse
|
||||
import xmlrpc
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from os import getenv
|
||||
|
||||
import lavacli
|
||||
import yaml
|
||||
from lavacli.utils import loader
|
||||
|
||||
# Timeout in minutes to decide if the device from the dispatched LAVA job has
|
||||
# Timeout in seconds to decide if the device from the dispatched LAVA job has
|
||||
# hung or not due to the lack of new log output.
|
||||
DEVICE_HANGING_TIMEOUT_MIN = 5
|
||||
DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60))
|
||||
|
||||
# How many seconds the script should wait before try a new polling iteration to
|
||||
# check if the dispatched LAVA job is running or waiting in the job queue.
|
||||
WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10
|
||||
WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10))
|
||||
|
||||
# How many seconds to wait between log output LAVA RPC calls.
|
||||
LOG_POLLING_TIME_SEC = 5
|
||||
LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5))
|
||||
|
||||
# How many retries should be made when a timeout happen.
|
||||
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2
|
||||
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2))
|
||||
|
||||
|
||||
def print_log(msg):
|
||||
|
@ -61,14 +63,7 @@ def fatal_err(msg):
|
|||
|
||||
|
||||
def hide_sensitive_data(yaml_data, hide_tag="HIDEME"):
|
||||
out_data = ""
|
||||
|
||||
for line in yaml_data.splitlines(True):
|
||||
if hide_tag in line:
|
||||
continue
|
||||
out_data += line
|
||||
|
||||
return out_data
|
||||
return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line)
|
||||
|
||||
|
||||
def generate_lava_yaml(args):
|
||||
|
@ -211,7 +206,6 @@ def _call_proxy(fn, *args):
|
|||
fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
|
||||
else:
|
||||
time.sleep(15)
|
||||
pass
|
||||
except xmlrpc.client.Fault as err:
|
||||
traceback.print_exc()
|
||||
fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
|
||||
|
@ -222,8 +216,8 @@ def get_job_results(proxy, job_id, test_suite, test_case):
|
|||
results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
|
||||
results = yaml.load(results_yaml, Loader=loader(False))
|
||||
for res in results:
|
||||
metadata = res['metadata']
|
||||
if not 'result' in metadata or metadata['result'] != 'fail':
|
||||
metadata = res["metadata"]
|
||||
if "result" not in metadata or metadata["result"] != "fail":
|
||||
continue
|
||||
if 'error_type' in metadata and metadata['error_type'] == "Infrastructure":
|
||||
print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
|
||||
|
@ -260,8 +254,7 @@ def follow_job_execution(proxy, job_id):
|
|||
last_time_logs = datetime.now()
|
||||
while not finished:
|
||||
(finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
|
||||
logs = yaml.load(str(data), Loader=loader(False))
|
||||
if logs:
|
||||
if logs := yaml.load(str(data), Loader=loader(False)):
|
||||
# Reset the timeout
|
||||
last_time_logs = datetime.now()
|
||||
for line in logs:
|
||||
|
@ -270,7 +263,7 @@ def follow_job_execution(proxy, job_id):
|
|||
line_count += len(logs)
|
||||
|
||||
else:
|
||||
time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN)
|
||||
time_limit = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
|
||||
if datetime.now() - last_time_logs > time_limit:
|
||||
print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
|
||||
return False
|
||||
|
@ -298,21 +291,7 @@ def submit_job(proxy, job_file):
|
|||
return _call_proxy(proxy.scheduler.jobs.submit, job_file)
|
||||
|
||||
|
||||
def main(args):
|
||||
proxy = setup_lava_proxy()
|
||||
|
||||
yaml_file = generate_lava_yaml(args)
|
||||
|
||||
if args.dump_yaml:
|
||||
print(hide_sensitive_data(generate_lava_yaml(args)))
|
||||
|
||||
if args.validate_only:
|
||||
ret = validate_job(proxy, yaml_file)
|
||||
if not ret:
|
||||
fatal_err("Error in LAVA job definition")
|
||||
print("LAVA job definition validated successfully")
|
||||
return
|
||||
|
||||
def retriable_follow_job(proxy, yaml_file):
|
||||
retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
|
||||
|
||||
while retry_count >= 0:
|
||||
|
@ -332,8 +311,36 @@ def main(args):
|
|||
|
||||
show_job_data(proxy, job_id)
|
||||
|
||||
if get_job_results(proxy, job_id, "0_mesa", "mesa") == True:
|
||||
break
|
||||
if get_job_results(proxy, job_id, "0_mesa", "mesa") == True:
|
||||
break
|
||||
else:
|
||||
# The script attempted all the retries. The job seemed to fail.
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def main(args):
|
||||
proxy = setup_lava_proxy()
|
||||
|
||||
yaml_file = generate_lava_yaml(args)
|
||||
|
||||
if args.dump_yaml:
|
||||
print(hide_sensitive_data(generate_lava_yaml(args)))
|
||||
|
||||
if args.validate_only:
|
||||
ret = validate_job(proxy, yaml_file)
|
||||
if not ret:
|
||||
fatal_err("Error in LAVA job definition")
|
||||
print("LAVA job definition validated successfully")
|
||||
return
|
||||
|
||||
if not retriable_follow_job(proxy, yaml_file):
|
||||
fatal_err(
|
||||
"Job failed after it exceeded the number of"
|
||||
f"{NUMBER_OF_RETRIES_TIMEOUT_DETECTION} retries."
|
||||
)
|
||||
|
||||
|
||||
def create_parser():
|
||||
parser = argparse.ArgumentParser("LAVA job submitter")
|
||||
|
|
Loading…
Reference in New Issue