ci/lava: Detect R8152 issues preemptively and retry
Implement a log-based retry hint for R8152 issue described in #6681, which is based on detecting these two consecutive lines: ``` r8152 <USB> eth0: Tx status -71 nfs: server <IP> not responding, still trying ``` Where <IP> and <USB> could be any IP and USB addresses, respectfully. This commit is a temporary fix since it requires a section-aware log follower, implemented in !16323. When the cited MR is merged, one will make a proper fix on top of that. Closes: #6681 Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17389>
This commit is contained in:
parent
45a4b01427
commit
2c51b7a9c9
|
@ -19,3 +19,10 @@ class MesaCIRetryError(MesaCIException):
|
||||||
|
|
||||||
class MesaCIParseException(MesaCIException):
|
class MesaCIParseException(MesaCIException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class MesaCIKnownIssueException(MesaCIException):
|
||||||
|
"""Exception raised when the Mesa CI script finds something in the logs that
|
||||||
|
is known to cause the LAVA job to eventually fail"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from lava.utils import LogFollower
|
||||||
|
|
||||||
|
from lava.exceptions import MesaCIKnownIssueException
|
||||||
|
from lava.utils.log_section import LogSectionType
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class LAVALogHints:
|
||||||
|
log_follower: LogFollower
|
||||||
|
has_r8152_issue_history: bool = field(default=False, init=False)
|
||||||
|
|
||||||
|
def detect_failure(self, new_lines: list[dict[str, Any]]):
|
||||||
|
for line in new_lines:
|
||||||
|
self.detect_r8152_issue(line)
|
||||||
|
|
||||||
|
def detect_r8152_issue(self, line):
|
||||||
|
if (
|
||||||
|
self.log_follower.phase == LogSectionType.TEST_CASE
|
||||||
|
and line["lvl"] == "target"
|
||||||
|
):
|
||||||
|
if re.search(r"r8152 \S+ eth0: Tx status -71", line["msg"]):
|
||||||
|
self.has_r8152_issue_history = True
|
||||||
|
return
|
||||||
|
|
||||||
|
if self.has_r8152_issue_history and re.search(
|
||||||
|
r"nfs: server \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} not responding, still trying",
|
||||||
|
line["msg"],
|
||||||
|
):
|
||||||
|
raise MesaCIKnownIssueException(
|
||||||
|
"Probable network issue failure encountered, retrying the job"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.has_r8152_issue_history = False
|
|
@ -20,6 +20,7 @@ from typing import Optional, Union
|
||||||
from lava.exceptions import MesaCITimeoutError
|
from lava.exceptions import MesaCITimeoutError
|
||||||
from lava.utils.console_format import CONSOLE_LOG
|
from lava.utils.console_format import CONSOLE_LOG
|
||||||
from lava.utils.gitlab_section import GitlabSection
|
from lava.utils.gitlab_section import GitlabSection
|
||||||
|
from lava.utils.lava_log_hints import LAVALogHints
|
||||||
from lava.utils.log_section import (
|
from lava.utils.log_section import (
|
||||||
DEFAULT_GITLAB_SECTION_TIMEOUTS,
|
DEFAULT_GITLAB_SECTION_TIMEOUTS,
|
||||||
FALLBACK_GITLAB_SECTION_TIMEOUT,
|
FALLBACK_GITLAB_SECTION_TIMEOUT,
|
||||||
|
@ -36,12 +37,14 @@ class LogFollower:
|
||||||
)
|
)
|
||||||
fallback_timeout: timedelta = FALLBACK_GITLAB_SECTION_TIMEOUT
|
fallback_timeout: timedelta = FALLBACK_GITLAB_SECTION_TIMEOUT
|
||||||
_buffer: list[str] = field(default_factory=list, init=False)
|
_buffer: list[str] = field(default_factory=list, init=False)
|
||||||
|
log_hints: LAVALogHints = field(init=False)
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
section_is_created = bool(self.current_section)
|
section_is_created = bool(self.current_section)
|
||||||
section_has_started = bool(
|
section_has_started = bool(
|
||||||
self.current_section and self.current_section.has_started
|
self.current_section and self.current_section.has_started
|
||||||
)
|
)
|
||||||
|
self.log_hints = LAVALogHints(self)
|
||||||
assert (
|
assert (
|
||||||
section_is_created == section_has_started
|
section_is_created == section_has_started
|
||||||
), "Can't follow logs beginning from uninitialized GitLab sections."
|
), "Can't follow logs beginning from uninitialized GitLab sections."
|
||||||
|
@ -138,6 +141,8 @@ class LogFollower:
|
||||||
if parsed_line := parse_lava_line(line):
|
if parsed_line := parse_lava_line(line):
|
||||||
self._buffer.append(parsed_line)
|
self._buffer.append(parsed_line)
|
||||||
|
|
||||||
|
self.log_hints.detect_failure(new_lines)
|
||||||
|
|
||||||
return is_job_healthy
|
return is_job_healthy
|
||||||
|
|
||||||
def flush(self) -> list[str]:
|
def flush(self) -> list[str]:
|
||||||
|
|
|
@ -9,7 +9,7 @@ from datetime import datetime, timedelta
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import yaml
|
import yaml
|
||||||
from lava.exceptions import MesaCITimeoutError
|
from lava.exceptions import MesaCIKnownIssueException, MesaCITimeoutError
|
||||||
from lava.utils import (
|
from lava.utils import (
|
||||||
GitlabSection,
|
GitlabSection,
|
||||||
LogFollower,
|
LogFollower,
|
||||||
|
@ -290,3 +290,60 @@ def test_gitlab_section_id(case_name, expected_id):
|
||||||
|
|
||||||
assert gl.id == expected_id
|
assert gl.id == expected_id
|
||||||
|
|
||||||
|
|
||||||
|
A618_NETWORK_ISSUE_LOGS = [
|
||||||
|
create_lava_yaml_msg(
|
||||||
|
msg="[ 1733.599402] r8152 2-1.3:1.0 eth0: Tx status -71", lvl="target"
|
||||||
|
),
|
||||||
|
create_lava_yaml_msg(
|
||||||
|
msg="[ 1733.604506] nfs: server 192.168.201.1 not responding, still trying",
|
||||||
|
lvl="target",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
TEST_PHASE_LAVA_SIGNAL = create_lava_yaml_msg(
|
||||||
|
msg="Received signal: <STARTTC> mesa-ci_a618_vk", lvl="debug"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
A618_NETWORK_ISSUE_SCENARIOS = {
|
||||||
|
"Pass - R8152 kmsg during boot": (A618_NETWORK_ISSUE_LOGS, does_not_raise()),
|
||||||
|
"Fail - R8152 kmsg during test phase": (
|
||||||
|
[TEST_PHASE_LAVA_SIGNAL, *A618_NETWORK_ISSUE_LOGS],
|
||||||
|
pytest.raises(MesaCIKnownIssueException),
|
||||||
|
),
|
||||||
|
"Pass - Partial (1) R8152 kmsg during test phase": (
|
||||||
|
[TEST_PHASE_LAVA_SIGNAL, A618_NETWORK_ISSUE_LOGS[0]],
|
||||||
|
does_not_raise(),
|
||||||
|
),
|
||||||
|
"Pass - Partial (2) R8152 kmsg during test phase": (
|
||||||
|
[TEST_PHASE_LAVA_SIGNAL, A618_NETWORK_ISSUE_LOGS[1]],
|
||||||
|
does_not_raise(),
|
||||||
|
),
|
||||||
|
"Pass - Partial subsequent (3) R8152 kmsg during test phase": (
|
||||||
|
[
|
||||||
|
TEST_PHASE_LAVA_SIGNAL,
|
||||||
|
A618_NETWORK_ISSUE_LOGS[0],
|
||||||
|
A618_NETWORK_ISSUE_LOGS[0],
|
||||||
|
],
|
||||||
|
does_not_raise(),
|
||||||
|
),
|
||||||
|
"Pass - Partial subsequent (4) R8152 kmsg during test phase": (
|
||||||
|
[
|
||||||
|
TEST_PHASE_LAVA_SIGNAL,
|
||||||
|
A618_NETWORK_ISSUE_LOGS[1],
|
||||||
|
A618_NETWORK_ISSUE_LOGS[1],
|
||||||
|
],
|
||||||
|
does_not_raise(),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"messages, expectation",
|
||||||
|
A618_NETWORK_ISSUE_SCENARIOS.values(),
|
||||||
|
ids=A618_NETWORK_ISSUE_SCENARIOS.keys(),
|
||||||
|
)
|
||||||
|
def test_detect_failure(messages, expectation):
|
||||||
|
lf = LogFollower()
|
||||||
|
with expectation:
|
||||||
|
lf.feed(messages)
|
||||||
|
|
Loading…
Reference in New Issue