Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Syslog for transceiver high/low temperature alarm #465

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions sonic-xcvrd/tests/test_xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1892,6 +1892,7 @@ def test_DomInfoUpdateTask_task_worker(self, mock_post_pm_info, mock_update_stat
task.task_stopping_event.wait = MagicMock(side_effect=[False, True])
task.get_dom_polling_from_config_db = MagicMock(return_value='enabled')
task.is_port_in_cmis_terminal_state = MagicMock(return_value=False)
task.check_transceiver_temperature = MagicMock()
mock_detect_error.return_value = True
task.task_worker()
assert task.port_mapping.logical_port_list.count('Ethernet0')
Expand All @@ -1911,6 +1912,48 @@ def test_DomInfoUpdateTask_task_worker(self, mock_post_pm_info, mock_update_stat
assert mock_update_status_hw.call_count == 1
assert mock_post_pm_info.call_count == 1

@patch('xcvrd.xcvrd_utilities.port_event_helper.PortMapping.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
@patch('xcvrd.xcvrd._wrapper_get_presence', MagicMock(return_value=True))
@pytest.mark.parametrize("dom_info_cache, dom_th_info, expected", [
({0: {'temperature': '75'}},
(('temphighalarm', '80'),
('templowalarm', '0'),
('temphighwarning', '70'),
('templowwarning', '0')),
3), #TEMP_NORMAL = 0
({0: {'temperature': '85'}},
(('temphighalarm', '80'),
('templowalarm', '0'),
('temphighwarning', '70'),
('templowwarning', '10')),
1), #TEMP_HIGH_ALARM = 1
({0: {'temperature': '5'}},
(('temphighalarm', '80'),
('templowalarm', '0'),
('temphighwarning', '70'),
('templowwarning', '10')),
4), #TEMP_LOW_WARNING = 4
])
def test_check_transceiver_temperature(self, dom_info_cache, dom_th_info, expected):
class MockTable:
data = {}
def set(self, key, fvs):
self.data[key] = fvs

def get(self, key):
return self.data.get(key)

port_mapping = PortMapping()
stop_event = threading.Event()
mock_cmis_manager = MagicMock()
task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, stop_event, mock_cmis_manager)
logical_port_name = 'Ethernet0'
temperature_status = {}
dom_th_tbl = MockTable()
dom_th_tbl.get = MagicMock(return_value=(True, dom_th_info))
task.check_transceiver_temperature(logical_port_name, dom_th_tbl, dom_info_cache, temperature_status)
assert temperature_status[0] == expected

@patch('xcvrd.xcvrd._wrapper_get_presence', MagicMock(return_value=False))
@patch('xcvrd.xcvrd.XcvrTableHelper')
@patch('xcvrd.xcvrd.delete_port_from_status_table_hw')
Expand Down
61 changes: 61 additions & 0 deletions sonic-xcvrd/xcvrd/xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1683,6 +1683,7 @@ def task_worker(self):
transceiver_status_cache = {}
pm_info_cache = {}
sel, asic_context = port_event_helper.subscribe_port_config_change(self.namespaces)
temperature_status = {}

# Start loop to update dom info in DB periodically
while not self.task_stopping_event.wait(DOM_INFO_UPDATE_PERIOD_SECS):
Expand Down Expand Up @@ -1735,6 +1736,8 @@ def task_worker(self):
helper_logger.log_warning("Got exception {} while processing pm info for port {}, ignored".format(repr(e), logical_port_name))
continue

self.check_transceiver_temperature(logical_port_name, self.xcvr_table_helper.get_dom_threshold_tbl(asic_index), dom_info_cache, temperature_status)

helper_logger.log_info("Stop DOM monitoring loop")

def run(self):
Expand All @@ -1758,6 +1761,64 @@ def join(self):
if self.exc:
raise self.exc

def check_transceiver_temperature(self, logical_port_name, th_table, dom_info_cache, temperature_status):
TEMP_NORMAL = 0
TEMP_HIGH_ALARM = 1
TEMP_LOW_ALARM = 2
TEMP_HIGH_WARNING = 3
TEMP_LOW_WARNING = 4

TEMP_ERROR_TO_DESCRIPTION_DICT = {
TEMP_NORMAL: "temperature normal",
TEMP_HIGH_ALARM: "temperature high alarm",
TEMP_LOW_ALARM: "temperature low alarm",
TEMP_HIGH_WARNING: "temperature high warning",
TEMP_LOW_WARNING: "temperature low warning"
}

for physical_port, physical_port_name in get_physical_port_name_dict(logical_port_name, self.port_mapping).items():
orig_temp_status = temperature_status.get(physical_port)
if orig_temp_status is None:
orig_temp_status = TEMP_NORMAL
temperature_status[physical_port] = orig_temp_status
new_temp_status = TEMP_NORMAL

dom_info_dict = dom_info_cache.get(physical_port)
presence, threshold = th_table.get(physical_port_name)
if presence:
dom_th_info_dict = dict(threshold)
else:
dom_th_info_dict = None
if dom_info_dict is not None and dom_th_info_dict is not None:
temperature = dom_info_dict.get("temperature")
temphighalarm = dom_th_info_dict.get("temphighalarm")
templowalarm = dom_th_info_dict.get("templowalarm")
temphighwarning = dom_th_info_dict.get("temphighwarning")
templowwarning = dom_th_info_dict.get("templowwarning")
if temperature != 'N/A' and temphighalarm != 'N/A' and templowalarm != 'N/A' and \
temphighwarning != 'N/A' and templowwarning != 'N/A':
if float(temperature) > float(temphighalarm):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@chiourung Can you please update the PR description with the snippet from the C_CMIS spec showing the shutdown behavior of the module due to high temperature.
Also, module shuts down only if current module temperature > temphighalarm or temphighwarning?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@chiourung Does the module not shutdown if temperature) == temphighalarm?

new_temp_status = TEMP_HIGH_ALARM
elif float(temperature) > float(temphighwarning):
new_temp_status = TEMP_HIGH_WARNING
elif float(temperature) < float(templowalarm):
new_temp_status = TEMP_LOW_ALARM
elif float(temperature) < float(templowwarning):
new_temp_status = TEMP_LOW_WARNING
else:
new_temp_status = TEMP_NORMAL
Comment on lines +1801 to +1809
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@chiourung Instead of Xcvrd doing the comparison, there are module flags for low/high warning thresholds in byte 9. Why not use those latched flag?

image

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@prgeor These flags are COR (Clear on Read). Cannot be used to determine if the temperature is from alarm/warning to normal. The comparison of the temperature can be kept to record when the temperature is alarm / warning.


if orig_temp_status != new_temp_status:
temperature_status[physical_port] = new_temp_status
helper_logger.log_notice("{}: temperature status changed from {} to {}".format(
physical_port_name,
TEMP_ERROR_TO_DESCRIPTION_DICT[orig_temp_status],
TEMP_ERROR_TO_DESCRIPTION_DICT[new_temp_status]))
elif new_temp_status > 0:
helper_logger.log_notice("{}: {}".format(physical_port_name, TEMP_ERROR_TO_DESCRIPTION_DICT[new_temp_status]))
else:
temperature_status[physical_port] = TEMP_NORMAL

def on_port_config_change(self, port_change_event):
if port_change_event.event_type == port_event_helper.PortChangeEvent.PORT_REMOVE:
self.on_remove_logical_port(port_change_event)
Expand Down
Loading