ci: Make LAVA jobs fail CI job when retry is exhausted

When the lava_job_submitter.py retry loop finishes normally (without
falling through break-loop) it means that the submitter has exceeded the
retry count limit. However, when it happens the script
finishes normally. This patch adds a treatment to this case, warning the
user what happened and forcing the job to fail.

Moreover, this commit will make retry configurations configurable by
CI job, as it can take the default value from the following variables:

- LAVA_DEVICE_HANGING_TIMEOUT_SEC
- LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC
- LAVA_LOG_POLLING_TIME_SEC
- LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14876>
This commit is contained in:
Guilherme Gallo 2022-02-16 01:07:57 -03:00 committed by Marge Bot
parent df0e2a1565
commit addac10443
1 changed files with 43 additions and 36 deletions

View File

@ -31,25 +31,27 @@ import time
import traceback
import urllib.parse
import xmlrpc
from datetime import datetime, timedelta
from os import getenv
import lavacli
import yaml
from lavacli.utils import loader
# Timeout in minutes to decide if the device from the dispatched LAVA job has
# Timeout in seconds to decide if the device from the dispatched LAVA job has
# hung or not due to the lack of new log output.
DEVICE_HANGING_TIMEOUT_MIN = 5
DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60))
# How many seconds the script should wait before try a new polling iteration to
# check if the dispatched LAVA job is running or waiting in the job queue.
WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10
WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10))
# How many seconds to wait between log output LAVA RPC calls.
LOG_POLLING_TIME_SEC = 5
LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5))
# How many retries should be made when a timeout happen.
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2))
def print_log(msg):
@ -61,14 +63,7 @@ def fatal_err(msg):
def hide_sensitive_data(yaml_data, hide_tag="HIDEME"):
out_data = ""
for line in yaml_data.splitlines(True):
if hide_tag in line:
continue
out_data += line
return out_data
return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line)
def generate_lava_yaml(args):
@ -211,7 +206,6 @@ def _call_proxy(fn, *args):
fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
else:
time.sleep(15)
pass
except xmlrpc.client.Fault as err:
traceback.print_exc()
fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
@ -222,8 +216,8 @@ def get_job_results(proxy, job_id, test_suite, test_case):
results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
results = yaml.load(results_yaml, Loader=loader(False))
for res in results:
metadata = res['metadata']
if not 'result' in metadata or metadata['result'] != 'fail':
metadata = res["metadata"]
if "result" not in metadata or metadata["result"] != "fail":
continue
if 'error_type' in metadata and metadata['error_type'] == "Infrastructure":
print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
@ -260,8 +254,7 @@ def follow_job_execution(proxy, job_id):
last_time_logs = datetime.now()
while not finished:
(finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
logs = yaml.load(str(data), Loader=loader(False))
if logs:
if logs := yaml.load(str(data), Loader=loader(False)):
# Reset the timeout
last_time_logs = datetime.now()
for line in logs:
@ -270,7 +263,7 @@ def follow_job_execution(proxy, job_id):
line_count += len(logs)
else:
time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN)
time_limit = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
if datetime.now() - last_time_logs > time_limit:
print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
return False
@ -298,21 +291,7 @@ def submit_job(proxy, job_file):
return _call_proxy(proxy.scheduler.jobs.submit, job_file)
def main(args):
proxy = setup_lava_proxy()
yaml_file = generate_lava_yaml(args)
if args.dump_yaml:
print(hide_sensitive_data(generate_lava_yaml(args)))
if args.validate_only:
ret = validate_job(proxy, yaml_file)
if not ret:
fatal_err("Error in LAVA job definition")
print("LAVA job definition validated successfully")
return
def retriable_follow_job(proxy, yaml_file):
retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
while retry_count >= 0:
@ -332,8 +311,36 @@ def main(args):
show_job_data(proxy, job_id)
if get_job_results(proxy, job_id, "0_mesa", "mesa") == True:
break
if get_job_results(proxy, job_id, "0_mesa", "mesa") == True:
break
else:
# The script attempted all the retries. The job seemed to fail.
return False
return True
def main(args):
proxy = setup_lava_proxy()
yaml_file = generate_lava_yaml(args)
if args.dump_yaml:
print(hide_sensitive_data(generate_lava_yaml(args)))
if args.validate_only:
ret = validate_job(proxy, yaml_file)
if not ret:
fatal_err("Error in LAVA job definition")
print("LAVA job definition validated successfully")
return
if not retriable_follow_job(proxy, yaml_file):
fatal_err(
"Job failed after it exceeded the number of"
f"{NUMBER_OF_RETRIES_TIMEOUT_DETECTION} retries."
)
def create_parser():
parser = argparse.ArgumentParser("LAVA job submitter")