2 changes: 2 additions & 0 deletions .github/actions/build_and_test_ya/action.yml
@@ -116,3 +116,5 @@ runs:
test_retry_count: ${{ inputs.test_retry_count }}
custom_branch_name: ${{ inputs.custom_branch_name }}
add_vcs_info: ${{ inputs.add_vcs_info }}
telegram_ydbot_token: ${{ fromJSON( inputs.secs ).TELEGRAM_YDBOT_TOKEN || '' }}
telegram_alert_logins: ${{ fromJSON( inputs.vars ).GH_ALERTS_TG_LOGINS || '' }}
20 changes: 19 additions & 1 deletion .github/actions/test_ya/action.yml
@@ -59,6 +59,14 @@ inputs:
type: boolean
default: false
description: "add ya make vars with git info"
telegram_ydbot_token:
type: string
required: false
description: "Telegram bot token"
telegram_alert_logins:
type: string
required: false
description: "Telegram alert logins"

outputs:
success:
@@ -120,7 +128,7 @@ runs:
echo "SUMMARY_LINKS=$PUBLIC_DIR/summary_links.txt" >> $GITHUB_ENV
echo "BUILD_PRESET=${{ inputs.build_preset }}" >> $GITHUB_ENV

python3 -m pip install ydb ydb[yc] codeowners humanize junitparser
python3 -m pip install ydb ydb[yc] codeowners humanize junitparser plotly

- name: Setup cache
shell: bash
@@ -132,6 +140,9 @@
- name: ya build and test
id: build
shell: bash
env:
TELEGRAM_BOT_TOKEN: ${{ inputs.telegram_ydbot_token }}
GH_ALERTS_TG_LOGINS: ${{ inputs.telegram_alert_logins }}
run: |
set -ex
echo "Artifacts will be uploaded [here](${PUBLIC_DIR_URL}/index.html)" | GITHUB_TOKEN="${{ github.token }}" .github/scripts/tests/comment-pr.py
@@ -345,6 +356,13 @@ runs:

.github/scripts/tests/report_analyzer.py --report_file "$CURRENT_REPORT" --summary_file $CURRENT_PUBLIC_DIR/summary_report.txt || true

.github/scripts/report_ram_analyzer.py \
--report-file "$CURRENT_REPORT" \
--output-file $CURRENT_PUBLIC_DIR/ram_report.html \
--output-file-url $CURRENT_PUBLIC_DIR_URL/ram_report.html \
--chat-id -4792293503 \
--memory-threshold 10 || true

# convert to chromium trace
# seems analyze-make don't have simple "output" parameter, so change cwd
ya_dir=$(pwd)
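A note on the --memory-threshold flag used in the step above: assuming it mirrors the comparison in report_ram_analyzer.py (added below), the value is a percentage of the runner's total RAM, not an absolute size. A minimal sketch of that check, with an invented 64 GB runner for illustration:

def should_alert(peak_used_gb, total_runner_gb, threshold_percent):
    # Alert when the peak summed RSS of test chunks exceeds the given
    # percentage of the runner's total RAM (the script reads it from /proc/meminfo).
    limit_gb = total_runner_gb * (threshold_percent / 100)
    return peak_used_gb > limit_gb

# With --memory-threshold 10 on a hypothetical 64 GB runner, any peak above 6.4 GB alerts.
print(should_alert(peak_used_gb=8.0, total_runner_gb=64.0, threshold_percent=10))  # True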
246 changes: 246 additions & 0 deletions .github/scripts/report_ram_analyzer.py
@@ -0,0 +1,246 @@
#!/usr/bin/env python3

import argparse
import json
from collections import defaultdict
import os
import subprocess
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timezone
from telegram.alert_queued_jobs import send_telegram_message, get_current_workflow_url, get_alert_logins


def timestamp_to_time(ts):
    """Format a UNIX timestamp as a UTC 'YYYY-MM-DD HH:MM:SS' string."""
    return datetime.fromtimestamp(ts, timezone.utc).strftime('%Y-%m-%d %H:%M:%S')


def get_total_runner_memory_in_gb():
cmd = [
'awk', '/MemTotal/ { printf "%.3f \\n", $2/1024/1024 }', '/proc/meminfo'
]
return float(subprocess.run(cmd, text=True, capture_output=True, timeout=60).stdout)


def calculate_total_memory_consumption(processes):
"""
Вычисляет суммарное потребление памяти для каждого момента времени.
Корректно работает с несортированными данными.

Args:
processes: список кортежей (rss_consumption, path, start_time, end_time)

Returns:
timeline: отсортированный список временных меток
memory_usage: список суммарного потребления памяти для каждой метки
"""
processes = sorted(processes, key=lambda x: x[2])
events = defaultdict(float)

for rss, path, start, end in processes:
        events[start] += rss  # the process starts: add its memory
        events[end] -= rss    # the process ends: release its memory

sorted_events = sorted(events.items(), key=lambda x: x[0])
timeline = []
memory_usage = []
current_memory = 0

for timestamp, delta in sorted_events:
        current_memory += delta  # apply the change at this event
timeline.append(timestamp)
memory_usage.append(round(current_memory, 2))
return timeline, memory_usage


def get_active_processes_at_time(processes, target_time):
"""
    Returns the list of processes that are active at the given point in time
"""
active = []
for rss, path, start, end in processes:
if start <= target_time < end:
active.append((rss, path, start, round(end)))
return active


def create_simple_interactive_plot(processes, output_file):
"""Упрощённая версия с hover-информацией"""
timeline, memory_usage = calculate_total_memory_consumption(processes)

    # Create a subplot with room for additional information
fig = make_subplots(
rows=1, cols=1,
row_heights=[1,],
subplot_titles=('Memory Consumption',),
vertical_spacing=0.12
)

    # Build hover text with information about the active processes
hover_texts = []
process_counts = []
    timeline_in_time = list(map(timestamp_to_time, timeline))
for t, mem in zip(timeline, memory_usage):
active = get_active_processes_at_time(processes, t)
process_counts.append(len(active))
test_suites = defaultdict(float)
for rss, path, _, _ in active:
test_suites[path.split(' ')[0]] += rss
test_suites = sorted(test_suites.items(),
key=lambda x: x[1], reverse=True)

hover_text = f"<b>Time:</b> {timstamp_to_time(t)}<br>"
hover_text += f"<b>Memory:</b> {mem} GB<br>"
hover_text += f"<b>Processes:</b> {len(active)}<br><br>"

if active:
hover_text += "<b>Top 5 Test Suites:</b><br>"
for suite, rss in test_suites[:5]:
hover_text += f" • {suite}: {round(rss, 2)} GB<br>"

hover_texts.append(hover_text)

    # Memory usage trace
fig.add_trace(
go.Scatter(
x=timeline_in_time,
y=memory_usage,
mode='lines',
name='Total RSS',
line=dict(shape='hv', width=1, color='rgb(46, 134, 171)'),
fill='tozeroy',
fillcolor='rgba(46, 134, 171, 0.3)',
hovertext=hover_texts,
hoverinfo='text'
),
row=1, col=1
)
fig.add_trace(
go.Scatter(
x=timeline_in_time,
y=process_counts,
mode='lines',
name='Active processes',
line=dict(shape='hv', width=1, color='rgb(171, 134, 46)'),
),
row=1, col=1
)

    # Mark the peak
max_memory = max(memory_usage)
max_idx = memory_usage.index(max_memory)
max_time = timeline[max_idx]

if not output_file:
print(hover_texts[max_idx].replace('<br>', '\n'))

fig.add_trace(
go.Scatter(
            x=[timestamp_to_time(max_time)],
y=[max_memory],
mode='markers+text',
marker=dict(size=15, color='red', symbol='star'),
text=[f'Peak: {max_memory} GB'],
textposition='top center',
name='Peak',
showlegend=False
),
row=1, col=1
)

fig.update_yaxes(title_text="Memory (GB)", row=1, col=1)

fig.update_layout(
height=800,
hovermode='x unified',
template='plotly_white',
title_text="Interactive Memory Consumption Monitor"
)
if output_file:
fig.write_html(output_file)
else:
fig.show()
return max_memory


def parse_report_file(report_json):
    processes = []
    for result in report_json["results"]:
        type_ = result["type"]
        if type_ == "test" and result.get("chunk"):
            rss_consumption = result["metrics"].get(
                "suite_max_proc_tree_memory_consumption_kb", 0) / 1024 / 1024
            start_time = result["metrics"].get('suite_start_timestamp', 0)
            end_time = start_time + result["metrics"].get("wall_time", 0)
            path = result["path"] + " " + result.get("subtest_name", "")
            processes.append((rss_consumption, path, start_time, end_time))
    return processes


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--report-file",
help="path to file received via 'ya make ... --build-results-report <file>'",
type=argparse.FileType("r"),
)
parser.add_argument(
"--output-file",
help="path to graph file"
)
parser.add_argument(
"--output-file-url",
help="Path to graph file in run artifacts"
)
parser.add_argument('--dry-run', action='store_true',
help='Debug mode without sending to Telegram')
parser.add_argument('--bot-token',
help='Telegram bot token (or use TELEGRAM_BOT_TOKEN env var)')
parser.add_argument('--chat-id',
help='Telegram chat ID')
parser.add_argument('--channel',
help='Telegram channel ID (alternative to --chat-id)')
parser.add_argument('--thread-id', type=int,
help='Telegram thread ID for group messages')
parser.add_argument('--memory-threshold', type=float,
help='Threshold for used memory in percent. Default = 90',
default=90)
args = parser.parse_args()

    report_file = args.report_file
    obj = json.load(report_file)
    processes = parse_report_file(obj)
    output_file = args.output_file

    # Draw or export the figure with RAM usage
    max_used_ram = create_simple_interactive_plot(processes, output_file)

max_agent_ram = get_total_runner_memory_in_gb()
max_agent_ram_with_threshold = max_agent_ram * (args.memory_threshold / 100)
if max_used_ram > max_agent_ram_with_threshold:
print(f"Max used RAM {max_used_ram} is greater than max agent RAM {max_agent_ram}")

bot_token = args.bot_token or os.getenv('TELEGRAM_BOT_TOKEN')
chat_id = args.channel or args.chat_id or os.getenv('TELEGRAM_CHAT_ID')
thread_id = args.thread_id or os.getenv('TELEGRAM_THREAD_ID')
dry_run = args.dry_run or os.getenv('DRY_RUN', 'false').lower() == 'true'

message = f"""🚨 *Possible OOM*
During [RUN]({get_current_workflow_url()}) max used RAM *{round(max_used_ram, 1)}GB* is greater than agent RAM *{round(max_agent_ram_with_threshold, 1)}GB*
{max_agent_ram}GB total
Threshold is {args.memory_threshold}%

[Ram usage graph]({args.output_file_url})
CC {get_alert_logins()}"""
if dry_run:
print(message)
else:
if chat_id and not chat_id.startswith('-') and len(chat_id) >= 10:
# Add -100 prefix for supergroup
chat_id = f"-100{chat_id}"
send_telegram_message(
bot_token,
chat_id,
message,
thread_id,
"MarkdownV2")
8 changes: 4 additions & 4 deletions .github/workflows/run_tests.yml
@@ -169,7 +169,7 @@ jobs:
custom_branch_name: ${{ matrix.branch }}
put_build_results_to_cache: true
additional_ya_make_args: -DDEBUGINFO_LINES_ONLY ${{ inputs.additional_ya_make_args }}
secs: ${{ format('{{"AWS_KEY_ID":"{0}","AWS_KEY_VALUE":"{1}","REMOTE_CACHE_USERNAME":"{2}","REMOTE_CACHE_PASSWORD":"{3}"}}',
secrets.AWS_KEY_ID, secrets.AWS_KEY_VALUE, secrets.REMOTE_CACHE_USERNAME, secrets.REMOTE_CACHE_PASSWORD ) }}
vars: ${{ format('{{"AWS_BUCKET":"{0}","AWS_ENDPOINT":"{1}","REMOTE_CACHE_URL":"{2}"}}',
vars.AWS_BUCKET, vars.AWS_ENDPOINT, vars.REMOTE_CACHE_URL_YA ) }}
secs: ${{ format('{{"AWS_KEY_ID":"{0}","AWS_KEY_VALUE":"{1}","REMOTE_CACHE_USERNAME":"{2}","REMOTE_CACHE_PASSWORD":"{3}","TELEGRAM_YDBOT_TOKEN":"{4}"}}',
secrets.AWS_KEY_ID, secrets.AWS_KEY_VALUE, secrets.REMOTE_CACHE_USERNAME, secrets.REMOTE_CACHE_PASSWORD, secrets.TELEGRAM_YDBOT_TOKEN ) }}
vars: ${{ format('{{"AWS_BUCKET":"{0}","AWS_ENDPOINT":"{1}","REMOTE_CACHE_URL":"{2}","GH_ALERTS_TG_LOGINS":"{3}"}}',
vars.AWS_BUCKET, vars.AWS_ENDPOINT, vars.REMOTE_CACHE_URL_YA, vars.GH_ALERTS_TG_LOGINS ) }}
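The two new keys are unpacked in build_and_test_ya/action.yml via fromJSON(...).KEY || ''. A rough Python analogue of that expression, just to show the fallback behaviour; the JSON string below is only an example of what the format() call renders:

import json

def pick(json_str, key):
    # Rough analogue of `fromJSON(inputs.vars).KEY || ''` in the composite action.
    value = json.loads(json_str).get(key)
    return value or ''

vars_json = '{"AWS_BUCKET":"bucket","AWS_ENDPOINT":"endpoint","REMOTE_CACHE_URL":"url","GH_ALERTS_TG_LOGINS":"@dev1 @dev2"}'
print(pick(vars_json, "GH_ALERTS_TG_LOGINS"))  # -> @dev1 @dev2
print(pick(vars_json, "NOT_SET"))              # -> ''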