-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
179 lines (147 loc) · 6.51 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
### 数据预处理
# 1. 数据清洗
# 2. 数据填补
# 3. 数据归一化
## 注意数据质控、数据清洗、数据填补都需要对业务有一定的了解
## 数据归一化代谢大多根据QC归一化 转录组则有较多方式 FPKM TPM需要去了解
## 核心问题,当我们做了这些工作后 样本与样本之间是否可比?指标与指标之间是否可比?
### 样本基本趋势可视化
# 热图
# PCA图
### 分组差异分析与可视化
# 差异分析(两组 多组) t wilcox DEseq2 limma等等(需要学习底层原理)
# 差异代谢物 gene拿到之后可视化有火山图、热图、相关性热图、箱型图等
# 接着就是差异代谢物的生物学意义探索富集分析
### 富集分析
## ORA GESA GRSA
## https://github.com/Asa12138/ReporterScore
# 1. 富集分析背景集(基因和代谢物)下载
## R见这里 https://mp.weixin.qq.com/s/NjlRMqEmD5OqMrYsmKy6Iw
## 实际上是通过这个网页https://rest.kegg.jp/get/hsa05230/kgml/去处理的
## https://rest.kegg.jp/get/hsa05230 这个网页较为麻烦需要正则去处理了(也不好处理)
# 2. 富集分析通路下载(后续pathview画图,染色)
import requests
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
# 下载并解析KGML文件
def get_kgml(pathway_id):
url = f"https://rest.kegg.jp/get/{pathway_id}/kgml"
response = requests.get(url)
return response.text
# 解析KGML文件中的节点坐标信息
def parse_kgml(kgml_content, target_ids):
root = ET.fromstring(kgml_content)
coords = {}
for entry in root.findall(".//entry"):
entry_id = entry.get('name')
if entry_id and any(target in entry_id for target in target_ids):
graphics = entry.find(".//graphics")
if graphics is not None:
x = float(graphics.get('x'))
y = float(graphics.get('y'))
coords[entry_id] = (x, y)
return coords
# 下载通路图像
def get_image(pathway_id):
url = f"https://rest.kegg.jp/get/{pathway_id}/image"
response = requests.get(url)
img_file = f"{pathway_id}.png"
with open(img_file, 'wb') as f:
f.write(response.content)
return img_file
# 在图像上绘制标记
def draw_markers(img_file, coords, regulation_status):
img = Image.open(img_file)
draw = ImageDraw.Draw(img)
# 设定标记的颜色
color_map = {'upregulated': 'red', 'downregulated': 'blue'}
for id, (x, y) in coords.items():
base_id = id.replace('cpd:', '')
print(f"Processing ID: {id}, Base ID: {base_id}, Coordinates: ({x}, {y})") # 调试信息
# 确保状态存在,默认为下调
status = regulation_status.get(base_id , 'downregulated') # 默认状态为下调
color = color_map.get(status, 'blue')
draw.ellipse((x - 5, y - 5, x + 5, y + 5), outline='black', fill=color)
# modified_img_file = "modified_image.png"
# img.save(modified_img_file, dpi=(600, 600))
# draw.ellipse((x - 5, y - 5, x + 5, y + 5), outline='black', fill=color)
modified_img_file = "modified_image.png"
img.save(modified_img_file, dpi=(600, 600)) # 保存图像,设置dpi为600
return modified_img_file
# 无法处理基因 可以创建一个矩型的辅助函数
# 示例:获取并修改mmu00010的图像,标记上调和下调的代谢物
pathway_id = "mmu00010"
target_ids = ["C01172", "C00022", "C00024"] # 多个代谢物
regulation_status = {
"C01172": "upregulated", # 上调
"C00022": "downregulated", # 下调
"C00024": "upregulated" # 上调
}
# 获取KGML内容并解析坐标
kgml_content = get_kgml(pathway_id)
coords = parse_kgml(kgml_content, target_ids)
print("Coordinates:", coords) # 打印坐标
# 获取并修改图像
img_file = get_image(pathway_id)
modified_img_file = draw_markers(img_file, coords, regulation_status)
# 显示修改后的图像
modified_img = Image.open(modified_img_file)
plt.imshow(modified_img)
plt.axis('off')
plt.show()
# 3. 富集分析通路分类信息下载
## 主要借鉴:https://blog.csdn.net/qq_38774801/article/details/125981487
## 当然也可以爬虫保存成网页处理,自行完善吧
import re
import pandas as pd
# 读取文件
with open("a.txt", "r", encoding="utf-8") as file:
lines = file.readlines()
print(len(lines))
# 提取包含有效信息的行
filtered_lines = [line for line in lines if re.search(r">([0-9])", line)]
print(len(filtered_lines))
# 初始化注释变量和最终存储列表
big_annotation = ""
small_annotation = ""
final_lines = []
# 遍历筛选后的行,提取信息
for line in filtered_lines:
# 判断并更新大类注释
if re.search(r">\d\. ", line):
try:
big_annotation = re.search(r">.*?<", line).group(0)
big_annotation = re.split(r"\. ", big_annotation)[1].split("<")[0]
except IndexError:
continue # 忽略可能的分割错误行
# 判断并更新小类注释
elif re.search(r">\d\.\d{1,2} ", line):
try:
small_annotation = re.search(r">.*?<", line).group(0)
small_annotation = re.sub(r"^>\d\.\d{1,2} ", "", small_annotation).split("<")[0]
except IndexError:
continue
# 提取路径信息
elif re.search(r">\d{5}", line):
try:
# 提取路径 ID
element1 = re.search(r">\d{5}", line).group(0)[1:]
if "hsa+pathogen" not in line:
# 提取 Pathway Identifier
element2 = re.search(r"pathway/[a-zA-Z]{2,4}\d{5}", line).group(0).split("/")[1]
# 提取 Pathway 名称
element3_match = re.search(r'<a.*?>(.*?)</a>', line)
element3 = element3_match.group(1) if element3_match else ""
else:
element2 = "organism:hsa+pathogen"
element3_match = re.search(r'hsa\+pathogen.*?<', line)
element3 = element3_match.group(0).split(">")[1].split("<")[0] if element3_match else ""
# 组合信息并添加到最终列表
final_line = [element1, element2, element3, big_annotation, small_annotation]
final_lines.append(final_line)
except AttributeError:
continue # 忽略无匹配项的行
# 创建 DataFrame 并保存为 Excel
df = pd.DataFrame(final_lines, columns=["ID", "Pathway Identifier", "Pathway", "Big Annotation", "Small Annotation"])
df.to_excel("kegg.xlsx", index=False)