From 2c51b7a9c975199ea22b560efd2e2da94cdab47f Mon Sep 17 00:00:00 2001 From: Guilherme Gallo Date: Wed, 6 Jul 2022 22:59:11 -0300 Subject: [PATCH] ci/lava: Detect R8152 issues preemptively and retry Implement a log-based retry hint for R8152 issue described in #6681, which is based on detecting these two consecutive lines: ``` r8152 eth0: Tx status -71 nfs: server not responding, still trying ``` Where and could be any IP and USB addresses, respectfully. This commit is a temporary fix since it requires a section-aware log follower, implemented in !16323. When the cited MR is merged, one will make a proper fix on top of that. Closes: #6681 Signed-off-by: Guilherme Gallo Part-of: --- .gitlab-ci/lava/exceptions.py | 7 +++ .gitlab-ci/lava/utils/lava_log_hints.py | 40 +++++++++++++++++ .gitlab-ci/lava/utils/log_follower.py | 5 +++ .gitlab-ci/tests/utils/test_lava_log.py | 59 ++++++++++++++++++++++++- 4 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 .gitlab-ci/lava/utils/lava_log_hints.py diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py index 1c1db591897..3c9a63eb3c6 100644 --- a/.gitlab-ci/lava/exceptions.py +++ b/.gitlab-ci/lava/exceptions.py @@ -19,3 +19,10 @@ class MesaCIRetryError(MesaCIException): class MesaCIParseException(MesaCIException): pass + + +class MesaCIKnownIssueException(MesaCIException): + """Exception raised when the Mesa CI script finds something in the logs that + is known to cause the LAVA job to eventually fail""" + + pass diff --git a/.gitlab-ci/lava/utils/lava_log_hints.py b/.gitlab-ci/lava/utils/lava_log_hints.py new file mode 100644 index 00000000000..12317bb1127 --- /dev/null +++ b/.gitlab-ci/lava/utils/lava_log_hints.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from lava.utils import LogFollower + +from lava.exceptions import MesaCIKnownIssueException +from lava.utils.log_section import LogSectionType + + +@dataclass +class LAVALogHints: + log_follower: LogFollower + has_r8152_issue_history: bool = field(default=False, init=False) + + def detect_failure(self, new_lines: list[dict[str, Any]]): + for line in new_lines: + self.detect_r8152_issue(line) + + def detect_r8152_issue(self, line): + if ( + self.log_follower.phase == LogSectionType.TEST_CASE + and line["lvl"] == "target" + ): + if re.search(r"r8152 \S+ eth0: Tx status -71", line["msg"]): + self.has_r8152_issue_history = True + return + + if self.has_r8152_issue_history and re.search( + r"nfs: server \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} not responding, still trying", + line["msg"], + ): + raise MesaCIKnownIssueException( + "Probable network issue failure encountered, retrying the job" + ) + + self.has_r8152_issue_history = False diff --git a/.gitlab-ci/lava/utils/log_follower.py b/.gitlab-ci/lava/utils/log_follower.py index 532610dfba2..d08923bf07b 100644 --- a/.gitlab-ci/lava/utils/log_follower.py +++ b/.gitlab-ci/lava/utils/log_follower.py @@ -20,6 +20,7 @@ from typing import Optional, Union from lava.exceptions import MesaCITimeoutError from lava.utils.console_format import CONSOLE_LOG from lava.utils.gitlab_section import GitlabSection +from lava.utils.lava_log_hints import LAVALogHints from lava.utils.log_section import ( DEFAULT_GITLAB_SECTION_TIMEOUTS, FALLBACK_GITLAB_SECTION_TIMEOUT, @@ -36,12 +37,14 @@ class LogFollower: ) fallback_timeout: timedelta = FALLBACK_GITLAB_SECTION_TIMEOUT _buffer: list[str] = field(default_factory=list, init=False) + log_hints: LAVALogHints = field(init=False) def __post_init__(self): section_is_created = bool(self.current_section) section_has_started = bool( self.current_section and self.current_section.has_started ) + self.log_hints = LAVALogHints(self) assert ( section_is_created == section_has_started ), "Can't follow logs beginning from uninitialized GitLab sections." @@ -138,6 +141,8 @@ class LogFollower: if parsed_line := parse_lava_line(line): self._buffer.append(parsed_line) + self.log_hints.detect_failure(new_lines) + return is_job_healthy def flush(self) -> list[str]: diff --git a/.gitlab-ci/tests/utils/test_lava_log.py b/.gitlab-ci/tests/utils/test_lava_log.py index cc5e8eec9bd..1219cc9f830 100644 --- a/.gitlab-ci/tests/utils/test_lava_log.py +++ b/.gitlab-ci/tests/utils/test_lava_log.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta import pytest import yaml -from lava.exceptions import MesaCITimeoutError +from lava.exceptions import MesaCIKnownIssueException, MesaCITimeoutError from lava.utils import ( GitlabSection, LogFollower, @@ -290,3 +290,60 @@ def test_gitlab_section_id(case_name, expected_id): assert gl.id == expected_id + +A618_NETWORK_ISSUE_LOGS = [ + create_lava_yaml_msg( + msg="[ 1733.599402] r8152 2-1.3:1.0 eth0: Tx status -71", lvl="target" + ), + create_lava_yaml_msg( + msg="[ 1733.604506] nfs: server 192.168.201.1 not responding, still trying", + lvl="target", + ), +] +TEST_PHASE_LAVA_SIGNAL = create_lava_yaml_msg( + msg="Received signal: mesa-ci_a618_vk", lvl="debug" +) + + +A618_NETWORK_ISSUE_SCENARIOS = { + "Pass - R8152 kmsg during boot": (A618_NETWORK_ISSUE_LOGS, does_not_raise()), + "Fail - R8152 kmsg during test phase": ( + [TEST_PHASE_LAVA_SIGNAL, *A618_NETWORK_ISSUE_LOGS], + pytest.raises(MesaCIKnownIssueException), + ), + "Pass - Partial (1) R8152 kmsg during test phase": ( + [TEST_PHASE_LAVA_SIGNAL, A618_NETWORK_ISSUE_LOGS[0]], + does_not_raise(), + ), + "Pass - Partial (2) R8152 kmsg during test phase": ( + [TEST_PHASE_LAVA_SIGNAL, A618_NETWORK_ISSUE_LOGS[1]], + does_not_raise(), + ), + "Pass - Partial subsequent (3) R8152 kmsg during test phase": ( + [ + TEST_PHASE_LAVA_SIGNAL, + A618_NETWORK_ISSUE_LOGS[0], + A618_NETWORK_ISSUE_LOGS[0], + ], + does_not_raise(), + ), + "Pass - Partial subsequent (4) R8152 kmsg during test phase": ( + [ + TEST_PHASE_LAVA_SIGNAL, + A618_NETWORK_ISSUE_LOGS[1], + A618_NETWORK_ISSUE_LOGS[1], + ], + does_not_raise(), + ), +} + + +@pytest.mark.parametrize( + "messages, expectation", + A618_NETWORK_ISSUE_SCENARIOS.values(), + ids=A618_NETWORK_ISSUE_SCENARIOS.keys(), +) +def test_detect_failure(messages, expectation): + lf = LogFollower() + with expectation: + lf.feed(messages)