381 lines
13 KiB
Python
Executable File
381 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
#
|
|
# Copyright (C) 2020, 2021 Collabora Limited
|
|
# Author: Gustavo Padovan <gustavo.padovan@collabora.com>
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
# copy of this software and associated documentation files (the "Software"),
|
|
# to deal in the Software without restriction, including without limitation
|
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
# and/or sell copies of the Software, and to permit persons to whom the
|
|
# Software is furnished to do so, subject to the following conditions:
|
|
#
|
|
# The above copyright notice and this permission notice (including the next
|
|
# paragraph) shall be included in all copies or substantial portions of the
|
|
# Software.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
# SOFTWARE.
|
|
|
|
"""Send a job to LAVA, track it and collect log back"""
|
|
|
|
import argparse
|
|
import pathlib
|
|
import sys
|
|
import time
|
|
import traceback
|
|
import urllib.parse
|
|
import xmlrpc
|
|
|
|
from datetime import datetime, timedelta
|
|
from os import getenv
|
|
|
|
import lavacli
|
|
import yaml
|
|
from lavacli.utils import loader
|
|
|
|
# Timeout in seconds to decide if the device from the dispatched LAVA job has
# hung or not due to the lack of new log output.
DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60))

# How many seconds the script should wait before trying a new polling iteration
# to check if the dispatched LAVA job is running or waiting in the job queue.
WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10))

# How many seconds to wait between log output LAVA RPC calls.
LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5))

# How many retries should be made when a timeout happens.
NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2))
|
|
|
|
|
|
def print_log(msg):
    """Print *msg* to stdout, prefixed with the current timestamp."""
    print(f"{datetime.now()}: {msg}")
|
|
|
|
def fatal_err(msg):
    """Log *msg* with a timestamp, then terminate the script with exit code 1."""
    print_log(msg)
    sys.exit(1)
|
|
|
|
|
|
def hide_sensitive_data(yaml_data, hide_tag="HIDEME"):
    """Return *yaml_data* with every line containing *hide_tag* dropped.

    Used to strip secrets (e.g. the JWT echo line) before dumping the job
    definition for humans to read.
    """
    kept_lines = [line for line in yaml_data.splitlines(True) if hide_tag not in line]
    return "".join(kept_lines)
|
|
|
|
|
|
def generate_lava_yaml(args):
    """Build the complete LAVA job definition for this CI job.

    Assembles job metadata, a TFTP/NFS deploy action, a boot action and a
    single inline test definition whose steps are generated from the
    first-stage init script plus per-job wget/untar commands, and returns
    the whole thing serialized as a YAML string.
    """
    # General metadata and permissions, plus also inexplicably kernel arguments
    values = {
        'job_name': 'mesa: {}'.format(args.pipeline_info),
        'device_type': args.device_type,
        'visibility': { 'group': [ args.visibility_group ] },
        'priority': 75,
        'context': {
            'extra_nfsroot_args': ' init=/init rootwait minio_results={}'.format(args.job_artifacts_base)
        },
        'timeouts': {
            'job': {
                'minutes': args.job_timeout
            }
        },
    }

    if args.lava_tags:
        values['tags'] = args.lava_tags.split(',')

    # URLs to our kernel rootfs to boot from, both generated by the base
    # container build
    deploy = {
        'timeout': { 'minutes': 10 },
        'to': 'tftp',
        'os': 'oe',
        'kernel': {
            'url': '{}/{}'.format(args.base_system_url_prefix, args.kernel_image_name),
        },
        'nfsrootfs': {
            'url': '{}/lava-rootfs.tgz'.format(args.base_system_url_prefix),
            'compression': 'gz',
        }
    }
    # Optional fields: only emitted when the corresponding argument is set.
    if args.kernel_image_type:
        deploy['kernel']['type'] = args.kernel_image_type
    if args.dtb:
        deploy['dtb'] = {
            'url': '{}/{}.dtb'.format(args.base_system_url_prefix, args.dtb)
        }

    # always boot over NFS
    boot = {
        'timeout': { 'minutes': 25 },
        'method': args.boot_method,
        'commands': 'nfs',
        'prompts': ['lava-shell:'],
    }

    # skeleton test definition: only declaring each job as a single 'test'
    # since LAVA's test parsing is not useful to us
    test = {
        'timeout': { 'minutes': args.job_timeout },
        'failure_retry': 1,
        'definitions': [ {
            'name': 'mesa',
            'from': 'inline',
            'path': 'inline/mesa.yaml',
            'repository': {
                'metadata': {
                    'name': 'mesa',
                    'description': 'Mesa test plan',
                    'os': [ 'oe' ],
                    'scope': [ 'functional' ],
                    'format': 'Lava-Test Test Definition 1.0',
                },
                'parse': {
                    'pattern': r'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))'
                },
                # 'steps' is filled in below once init_lines is assembled.
                'run': {
                },
            },
        } ],
    }

    # job execution script:
    #   - inline .gitlab-ci/common/init-stage1.sh
    #   - fetch and unpack per-pipeline build artifacts from build job
    #   - fetch and unpack per-job environment from lava-submit.sh
    #   - exec .gitlab-ci/common/init-stage2.sh
    init_lines = []

    # Inline the first-stage init script, skipping comments and blank lines.
    with open(args.first_stage_init, 'r') as init_sh:
        init_lines += [ x.rstrip() for x in init_sh if not x.startswith('#') and x.rstrip() ]

    # Write the JWT to the DUT. The HIDEME tag lets hide_sensitive_data()
    # strip this line from any dumped YAML; set +x/-x keeps the token out
    # of shell tracing output.
    with open(args.jwt_file) as jwt_file:
        init_lines += [
            "set +x",
            f'echo -n "{jwt_file.read()}" > "{args.jwt_file}" # HIDEME',
            "set -x",
        ]

    init_lines += [
        'mkdir -p {}'.format(args.ci_project_dir),
        'wget -S --progress=dot:giga -O- {} | tar -xz -C {}'.format(args.mesa_build_url, args.ci_project_dir),
        'wget -S --progress=dot:giga -O- {} | tar -xz -C /'.format(args.job_rootfs_overlay_url),
        f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh',
        'exec /init-stage2.sh',
    ]
    test['definitions'][0]['repository']['run']['steps'] = init_lines

    values['actions'] = [
        { 'deploy': deploy },
        { 'boot': boot },
        { 'test': test },
    ]

    # Huge width keeps yaml from wrapping long shell command lines.
    return yaml.dump(values, width=10000000)
|
|
|
|
|
|
def setup_lava_proxy():
    """Create an XML-RPC proxy to the LAVA server.

    Reads lavacli's "default" configuration for the server URI and
    credentials, and embeds username/token into the URI for basic auth.
    """
    config = lavacli.load_config("default")
    uri = config.get("uri")
    user = config.get("username")
    token = config.get("token")

    parsed = urllib.parse.urlparse(uri)
    auth_uri = "{}://{}:{}@{}{}".format(parsed.scheme, user, token, parsed.netloc, parsed.path)

    transport = lavacli.RequestsTransport(
        parsed.scheme,
        config.get("proxy"),
        config.get("timeout", 120.0),
        config.get("verify_ssl_cert", True),
    )

    proxy = xmlrpc.client.ServerProxy(auth_uri, allow_none=True, transport=transport)

    print_log("Proxy for {} created.".format(config['uri']))

    return proxy
|
|
|
|
|
|
def _call_proxy(fn, *args):
|
|
retries = 60
|
|
for n in range(1, retries + 1):
|
|
try:
|
|
return fn(*args)
|
|
except xmlrpc.client.ProtocolError as err:
|
|
if n == retries:
|
|
traceback.print_exc()
|
|
fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
|
|
else:
|
|
time.sleep(15)
|
|
except xmlrpc.client.Fault as err:
|
|
traceback.print_exc()
|
|
fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
|
|
|
|
|
|
def get_job_results(proxy, job_id, test_suite, test_case):
    """Fetch and evaluate the results of a finished LAVA job.

    Returns False when the job should be retried (infrastructure error or
    failed validation); exits the script when the test case is missing or
    did not pass; returns True on a pass.
    """
    # Look for infrastructure errors and retry if we see them.
    suite_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
    for entry in yaml.load(suite_yaml, Loader=loader(False)):
        meta = entry["metadata"]
        if meta.get("result") != "fail":
            continue
        if meta.get("error_type") == "Infrastructure":
            print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
            return False
        if meta.get("case") == "validate":
            print_log("LAVA job {} failed validation (possible download error). Retry.".format(job_id))
            return False

    case_yaml = _call_proxy(proxy.results.get_testcase_results_yaml, job_id, test_suite, test_case)
    case_results = yaml.load(case_yaml, Loader=loader(False))
    if not case_results:
        fatal_err("LAVA: no result for test_suite '{}', test_case '{}'".format(test_suite, test_case))

    print_log("LAVA: result for test_suite '{}', test_case '{}': {}".format(test_suite, test_case, case_results[0]['result']))
    if case_results[0]['result'] != 'pass':
        fatal_err("FAIL")

    return True
|
|
|
|
def wait_until_job_is_started(proxy, job_id):
    """Block until the LAVA job leaves the scheduler queue and starts running."""
    print_log(f"Waiting for job {job_id} to start.")
    queue_states = ("Submitted", "Scheduling", "Scheduled")
    state = "Submitted"
    while state in queue_states:
        state = _call_proxy(proxy.scheduler.job_state, job_id)["job_state"]

        time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
    print_log(f"Job {job_id} started.")
|
|
|
|
def follow_job_execution(proxy, job_id):
    """Stream the LAVA job's log output until the job finishes.

    Returns True when the job ran to completion, or False when no new log
    output arrived for DEVICE_HANGING_TIMEOUT_SEC (device presumed hung;
    caller should retry).
    """
    lines_fetched = 0
    job_done = False
    last_activity = datetime.now()
    hang_limit = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)

    while not job_done:
        (job_done, raw_logs) = _call_proxy(proxy.scheduler.jobs.logs, job_id, lines_fetched)
        entries = yaml.load(str(raw_logs), Loader=loader(False))
        if entries:
            # Fresh output arrived: reset the hang-detection timer.
            last_activity = datetime.now()
            for entry in entries:
                print("{} {}".format(entry["dt"], entry["msg"]))

            lines_fetched += len(entries)
        else:
            if datetime.now() - last_activity > hang_limit:
                print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
                return False

            # `proxy.scheduler.jobs.logs` does not block, even when there is no
            # new log to be fetched. To avoid dosing the LAVA dispatcher
            # machine, let's add a sleep to save them some stamina.
            time.sleep(LOG_POLLING_TIME_SEC)

    return True
|
|
|
|
def show_job_data(proxy, job_id):
    """Print every field of the LAVA job's metadata, one per line."""
    job_info = _call_proxy(proxy.scheduler.jobs.show, job_id)
    for key in job_info:
        print("{}\t: {}".format(key, job_info[key]))
|
|
|
|
|
|
def validate_job(proxy, job_file):
    """Ask the LAVA scheduler to validate a job definition.

    Returns the scheduler's validation result, or False when validation
    fails. SystemExit must be caught here because _call_proxy() reports
    XML-RPC faults via fatal_err(), which calls sys.exit(); a validation
    fault should produce a False return, not terminate the script.
    """
    try:
        return _call_proxy(proxy.scheduler.jobs.validate, job_file, True)
    # Was a bare `except:`, which also swallowed KeyboardInterrupt and
    # GeneratorExit. Catch SystemExit (from fatal_err) plus ordinary
    # exceptions only, so Ctrl-C still aborts the script.
    except (SystemExit, Exception):
        return False
|
|
|
|
def submit_job(proxy, job_file):
    """Submit the YAML job definition to the LAVA scheduler and return the job id."""
    return _call_proxy(proxy.scheduler.jobs.submit, job_file)
|
|
|
|
|
|
def retriable_follow_job(proxy, yaml_file):
    """Submit a LAVA job and follow it, retrying when the device hangs.

    Returns True when a run passes, False when the retry budget for
    hang-detection is exhausted. Note that get_job_results() returning
    False (infrastructure error / failed validation) resubmits WITHOUT
    decrementing retry_count — only hang timeouts consume the budget.
    """
    retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION

    while retry_count >= 0:
        job_id = submit_job(proxy, yaml_file)

        print_log("LAVA job id: {}".format(job_id))

        wait_until_job_is_started(proxy, job_id)

        if not follow_job_execution(proxy, job_id):
            print_log(f"Job {job_id} has timed out. Cancelling it.")
            # Cancel the job as it is considered unreachable by Mesa CI.
            proxy.scheduler.jobs.cancel(job_id)

            retry_count -= 1
            continue

        show_job_data(proxy, job_id)

        # get_job_results() exits the script itself on a hard failure, so
        # True here means "passed" and False means "retry-worthy error".
        if get_job_results(proxy, job_id, "0_mesa", "mesa") == True:
            break
    else:
        # while/else: this branch runs only when the loop condition went
        # false, i.e. the script attempted all the retries. The job seemed
        # to fail.
        return False

    return True
|
|
|
|
|
|
def main(args):
    """Entry point: generate the job YAML, then dump/validate/submit it.

    Exits with a non-zero status (via fatal_err) on validation errors or
    when the job fails after exhausting its retries.
    """
    proxy = setup_lava_proxy()

    yaml_file = generate_lava_yaml(args)

    if args.dump_yaml:
        # Reuse the YAML we already generated instead of calling
        # generate_lava_yaml() a second time, which would re-read the
        # first-stage init script and the JWT file for nothing.
        print(hide_sensitive_data(yaml_file))

    if args.validate_only:
        ret = validate_job(proxy, yaml_file)
        if not ret:
            fatal_err("Error in LAVA job definition")
        print("LAVA job definition validated successfully")
        return

    if not retriable_follow_job(proxy, yaml_file):
        # Note the trailing space after "of": the two literals are
        # concatenated, and without it the message read "number of2 retries".
        fatal_err(
            "Job failed after it exceeded the number of "
            f"{NUMBER_OF_RETRIES_TIMEOUT_DETECTION} retries."
        )
|
|
|
|
|
|
def create_parser():
    """Build the command-line argument parser for the LAVA job submitter."""
    parser = argparse.ArgumentParser("LAVA job submitter")

    # Pipeline / artifact locations.
    parser.add_argument("--pipeline-info")
    parser.add_argument("--base-system-url-prefix")
    parser.add_argument("--mesa-build-url")
    parser.add_argument("--job-rootfs-overlay-url")
    parser.add_argument("--job-artifacts-base")
    parser.add_argument("--job-timeout", type=int)
    parser.add_argument("--first-stage-init")
    parser.add_argument("--ci-project-dir")

    # Device and boot configuration.
    parser.add_argument("--device-type")
    parser.add_argument("--dtb", nargs="?", default="")
    parser.add_argument("--kernel-image-name")
    parser.add_argument("--kernel-image-type", nargs="?", default="")
    parser.add_argument("--boot-method")
    parser.add_argument("--lava-tags", nargs="?", default="")
    parser.add_argument("--jwt-file", type=pathlib.Path)

    # Behaviour flags.
    parser.add_argument("--validate-only", action="store_true")
    parser.add_argument("--dump-yaml", action="store_true")
    parser.add_argument("--visibility-group")

    return parser
|
|
|
|
if __name__ == "__main__":
    # given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us ->
    # GitLab runner -> GitLab primary -> user, safe to say we don't need any
    # more buffering
    sys.stdout.reconfigure(line_buffering=True)
    sys.stderr.reconfigure(line_buffering=True)

    parser = create_parser()

    # Dispatch through args.func so the parser carries its own entry point.
    parser.set_defaults(func=main)
    args = parser.parse_args()
    args.func(args)
|