gitlab-ci: refactor timeout constants and tweak timeout values

* Refactor timeouts and retry attempts constants to variables in the top
  of the python script.

* Increase LAVA job timeout value from 1 minute to 5 minutes, since the
  timeout detection is just a heuristic based on the log silence in LAVA
  devices. If we keep 1 minute timeout, maybe we could cancel jobs that
  have tasks which may take too long to respond. Also, one minute
  timeout is prone to misdetect scenarios when some network errors or
  slowness may happen.

* Increase polling rate to check if the job has started from 1 check
  every 30 seconds to 1 check every 10 seconds. Since it was taking 30
  seconds in the worst case to start to get the log output from a LAVA
  job. It is important to note that some LAVA jobs take less than 2
  minutes to finish, so a 10 second wait would be more suitable in those
  cases.

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12870>
This commit is contained in:
Guilherme Gallo 2021-09-15 11:03:16 -03:00 committed by Marge Bot
parent 4655196796
commit 7244aa1980
1 changed files with 19 additions and 5 deletions

View File

@ -37,6 +37,20 @@ import yaml
from datetime import datetime, timedelta
from lavacli.utils import loader
# Timeout in minutes to decide if the device from the dispatched LAVA job has
# hung or not due to the lack of new log output.
DEVICE_HANGING_TIMEOUT_MIN = 5
# How many seconds the script should wait before try a new polling iteration to
# check if the dispatched LAVA job is running or waiting in the job queue.
WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10
# How many seconds to wait between log output LAVA RPC calls.
LOG_POLLING_TIME_SEC = 5
# How many retries should be made when a timeout happen.
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2
def print_log(msg):
print("{}: {}".format(datetime.now(), msg))
@ -112,7 +126,7 @@ def generate_lava_yaml(args):
'format': 'Lava-Test Test Definition 1.0',
},
'parse': {
'pattern': 'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))'
'pattern': r'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))'
},
'run': {
},
@ -218,7 +232,7 @@ def wait_until_job_is_started(proxy, job_id):
job_state = _call_proxy(proxy.scheduler.job_state, job_id)
current_state = job_state["job_state"]
time.sleep(30)
time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
print_log(f"Job {job_id} started.")
def follow_job_execution(proxy, job_id):
@ -237,7 +251,7 @@ def follow_job_execution(proxy, job_id):
line_count += len(logs)
else:
time_limit = timedelta(minutes=1)
time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN)
if datetime.now() - last_time_logs > time_limit:
print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
return False
@ -245,7 +259,7 @@ def follow_job_execution(proxy, job_id):
# `proxy.scheduler.jobs.logs` does not block, even when there is no
# new log to be fetched. To avoid dosing the LAVA dispatcher
# machine, let's add a sleep to save them some stamina.
time.sleep(5)
time.sleep(LOG_POLLING_TIME_SEC)
return True
@ -282,7 +296,7 @@ def main(args):
print("LAVA job definition validated successfully")
return
retry_count = 2
retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
while retry_count >= 0:
job_id = submit_job(proxy, yaml_file)