ci: Make LAVA jobs fail CI job when retry is exhausted

When the lava_job_submitter.py retry loop finishes normally (without
falling through break-loop) it means that the submitter has exceeded the
retry count limit. However, when it happens the script
finishes normally. This patch adds a treatment to this case, warning the
user what happened and forcing the job to fail.

Moreover, this commit will make retry configurations configurable by
CI job, as it can take the default value from the following variables:

- LAVA_DEVICE_HANGING_TIMEOUT_SEC
- LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC
- LAVA_LOG_POLLING_TIME_SEC
- LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14876>
This commit is contained in:
Guilherme Gallo 2022-02-16 01:07:57 -03:00 committed by Marge Bot
parent df0e2a1565
commit addac10443
1 changed files with 43 additions and 36 deletions

View File

@ -31,25 +31,27 @@ import time
import traceback import traceback
import urllib.parse import urllib.parse
import xmlrpc import xmlrpc
from datetime import datetime, timedelta from datetime import datetime, timedelta
from os import getenv
import lavacli import lavacli
import yaml import yaml
from lavacli.utils import loader from lavacli.utils import loader
# Timeout in minutes to decide if the device from the dispatched LAVA job has # Timeout in seconds to decide if the device from the dispatched LAVA job has
# hung or not due to the lack of new log output. # hung or not due to the lack of new log output.
DEVICE_HANGING_TIMEOUT_MIN = 5 DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60))
# How many seconds the script should wait before try a new polling iteration to # How many seconds the script should wait before try a new polling iteration to
# check if the dispatched LAVA job is running or waiting in the job queue. # check if the dispatched LAVA job is running or waiting in the job queue.
WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10 WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10))
# How many seconds to wait between log output LAVA RPC calls. # How many seconds to wait between log output LAVA RPC calls.
LOG_POLLING_TIME_SEC = 5 LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5))
# How many retries should be made when a timeout happen. # How many retries should be made when a timeout happen.
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2 NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2))
def print_log(msg): def print_log(msg):
@ -61,14 +63,7 @@ def fatal_err(msg):
def hide_sensitive_data(yaml_data, hide_tag="HIDEME"): def hide_sensitive_data(yaml_data, hide_tag="HIDEME"):
out_data = "" return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line)
for line in yaml_data.splitlines(True):
if hide_tag in line:
continue
out_data += line
return out_data
def generate_lava_yaml(args): def generate_lava_yaml(args):
@ -211,7 +206,6 @@ def _call_proxy(fn, *args):
fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg)) fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
else: else:
time.sleep(15) time.sleep(15)
pass
except xmlrpc.client.Fault as err: except xmlrpc.client.Fault as err:
traceback.print_exc() traceback.print_exc()
fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode)) fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
@ -222,8 +216,8 @@ def get_job_results(proxy, job_id, test_suite, test_case):
results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id) results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
results = yaml.load(results_yaml, Loader=loader(False)) results = yaml.load(results_yaml, Loader=loader(False))
for res in results: for res in results:
metadata = res['metadata'] metadata = res["metadata"]
if not 'result' in metadata or metadata['result'] != 'fail': if "result" not in metadata or metadata["result"] != "fail":
continue continue
if 'error_type' in metadata and metadata['error_type'] == "Infrastructure": if 'error_type' in metadata and metadata['error_type'] == "Infrastructure":
print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id)) print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
@ -260,8 +254,7 @@ def follow_job_execution(proxy, job_id):
last_time_logs = datetime.now() last_time_logs = datetime.now()
while not finished: while not finished:
(finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count) (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
logs = yaml.load(str(data), Loader=loader(False)) if logs := yaml.load(str(data), Loader=loader(False)):
if logs:
# Reset the timeout # Reset the timeout
last_time_logs = datetime.now() last_time_logs = datetime.now()
for line in logs: for line in logs:
@ -270,7 +263,7 @@ def follow_job_execution(proxy, job_id):
line_count += len(logs) line_count += len(logs)
else: else:
time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN) time_limit = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
if datetime.now() - last_time_logs > time_limit: if datetime.now() - last_time_logs > time_limit:
print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id)) print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
return False return False
@ -298,21 +291,7 @@ def submit_job(proxy, job_file):
return _call_proxy(proxy.scheduler.jobs.submit, job_file) return _call_proxy(proxy.scheduler.jobs.submit, job_file)
def main(args): def retriable_follow_job(proxy, yaml_file):
proxy = setup_lava_proxy()
yaml_file = generate_lava_yaml(args)
if args.dump_yaml:
print(hide_sensitive_data(generate_lava_yaml(args)))
if args.validate_only:
ret = validate_job(proxy, yaml_file)
if not ret:
fatal_err("Error in LAVA job definition")
print("LAVA job definition validated successfully")
return
retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
while retry_count >= 0: while retry_count >= 0:
@ -332,8 +311,36 @@ def main(args):
show_job_data(proxy, job_id) show_job_data(proxy, job_id)
if get_job_results(proxy, job_id, "0_mesa", "mesa") == True: if get_job_results(proxy, job_id, "0_mesa", "mesa") == True:
break break
else:
# The script attempted all the retries. The job seemed to fail.
return False
return True
def main(args):
proxy = setup_lava_proxy()
yaml_file = generate_lava_yaml(args)
if args.dump_yaml:
print(hide_sensitive_data(generate_lava_yaml(args)))
if args.validate_only:
ret = validate_job(proxy, yaml_file)
if not ret:
fatal_err("Error in LAVA job definition")
print("LAVA job definition validated successfully")
return
if not retriable_follow_job(proxy, yaml_file):
fatal_err(
"Job failed after it exceeded the number of"
f"{NUMBER_OF_RETRIES_TIMEOUT_DETECTION} retries."
)
def create_parser(): def create_parser():
parser = argparse.ArgumentParser("LAVA job submitter") parser = argparse.ArgumentParser("LAVA job submitter")