ci: Make LAVA jobs fail CI job when retry is exhausted

When the lava_job_submitter.py retry loop runs to completion (that is, without leaving through its `break`), it means that the submitter has exhausted the retry count limit. However, when that happens the script still exits as if it had succeeded. This patch handles that case by warning the user about what happened and forcing the job to fail. Moreover, this commit makes the retry settings configurable per CI job, since their default values can now be overridden through the following environment variables: - LAVA_DEVICE_HANGING_TIMEOUT_SEC - LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC - LAVA_LOG_POLLING_TIME_SEC - LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14876>
2022-02-16 01:07:57 -03:00 · 2022-02-16 01:07:57 -03:00 · addac10443
parent df0e2a1565
commit addac10443
1 changed files with 43 additions and 36 deletions
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@ -31,25 +31,27 @@ import time
 import traceback
 import urllib.parse
 import xmlrpc
+
 from datetime import datetime, timedelta
+from os import getenv

 import lavacli
 import yaml
 from lavacli.utils import loader

-# Timeout in minutes to decide if the device from the dispatched LAVA job has
+# Timeout in seconds to decide if the device from the dispatched LAVA job has
 # hung or not due to the lack of new log output.
-DEVICE_HANGING_TIMEOUT_MIN = 5
+DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC",  5*60))

 # How many seconds the script should wait before try a new polling iteration to
 # check if the dispatched LAVA job is running or waiting in the job queue.
-WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10
+WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10))

 # How many seconds to wait between log output LAVA RPC calls.
-LOG_POLLING_TIME_SEC = 5
+LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5))

 # How many retries should be made when a timeout happen.
-NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2
+NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2))


 def print_log(msg):
@ -61,14 +63,7 @@ def fatal_err(msg):


 def hide_sensitive_data(yaml_data, hide_tag="HIDEME"):
-    out_data = ""
-
-    for line in yaml_data.splitlines(True):
-        if hide_tag in line:
-            continue
-        out_data += line
-
-    return out_data
+    return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line)


 def generate_lava_yaml(args):
@ -211,7 +206,6 @@ def _call_proxy(fn, *args):
                fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
            else:
                time.sleep(15)
-                pass
        except xmlrpc.client.Fault as err:
            traceback.print_exc()
            fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
@ -222,8 +216,8 @@ def get_job_results(proxy, job_id, test_suite, test_case):
    results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
    results = yaml.load(results_yaml, Loader=loader(False))
    for res in results:
-        metadata = res['metadata']
-        if not 'result' in metadata or metadata['result'] != 'fail':
+        metadata = res["metadata"]
+        if "result" not in metadata or metadata["result"] != "fail":
            continue
        if 'error_type' in metadata and metadata['error_type'] == "Infrastructure":
            print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
@ -260,8 +254,7 @@ def follow_job_execution(proxy, job_id):
    last_time_logs = datetime.now()
    while not finished:
        (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
-        logs = yaml.load(str(data), Loader=loader(False))
-        if logs:
+        if logs := yaml.load(str(data), Loader=loader(False)):
            # Reset the timeout
            last_time_logs = datetime.now()
            for line in logs:
@ -270,7 +263,7 @@ def follow_job_execution(proxy, job_id):
            line_count += len(logs)

        else:
-            time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN)
+            time_limit = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
            if datetime.now() - last_time_logs > time_limit:
                print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
                return False
@ -298,21 +291,7 @@ def submit_job(proxy, job_file):
    return _call_proxy(proxy.scheduler.jobs.submit, job_file)


-def main(args):
-    proxy = setup_lava_proxy()
-
-    yaml_file = generate_lava_yaml(args)
-
-    if args.dump_yaml:
-        print(hide_sensitive_data(generate_lava_yaml(args)))
-
-    if args.validate_only:
-        ret = validate_job(proxy, yaml_file)
-        if not ret:
-            fatal_err("Error in LAVA job definition")
-        print("LAVA job definition validated successfully")
-        return
-
+def retriable_follow_job(proxy, yaml_file):
    retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION

    while retry_count >= 0:
@ -332,8 +311,36 @@ def main(args):

        show_job_data(proxy, job_id)

-        if get_job_results(proxy,  job_id, "0_mesa", "mesa") == True:
-             break
+        if get_job_results(proxy, job_id, "0_mesa", "mesa") == True:
+            break
+    else:
+        # The script attempted all the retries. The job seemed to fail.
+        return False
+
+    return True
+
+
+def main(args):  # entry point: optionally dump/validate the job, else submit and follow it
+    proxy = setup_lava_proxy()
+
+    yaml_file = generate_lava_yaml(args)
+
+    if args.dump_yaml:
+        print(hide_sensitive_data(generate_lava_yaml(args)))
+
+    if args.validate_only:
+        ret = validate_job(proxy, yaml_file)
+        if not ret:
+            fatal_err("Error in LAVA job definition")
+        print("LAVA job definition validated successfully")
+        return  # validation-only mode never submits the job
+
+    if not retriable_follow_job(proxy, yaml_file):  # False once every retry was used up
+        fatal_err(
+            "Job failed after it exceeded the number of "  # keep the trailing space: adjacent literals are joined as-is
+            f"{NUMBER_OF_RETRIES_TIMEOUT_DETECTION} retries."
+        )
+

 def create_parser():
    parser = argparse.ArgumentParser("LAVA job submitter")