NOTE that: the plot show that 'Mesomycoplasma hyorhinis' exists only in 'study zhang'(32 samples from Zhang et al. 2019). But you can find it's taxid is 2100 from the output(like from
c3 = left_join(c, c2, by = 'name') %>%
left_join(select(kr, rank, name) %>% distinct()) %>%
subset(r1>0 & r2>0 & r3>0 & p<0.05 & p1<0.05 & p2<0.05 & p3<0.05 & rank == 'S')
c3
# A tibble: 4 × 11
# Groups: taxid [4]
taxid r p name r1 r2 r3 p1 p2 p3 rank
<int> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 210 1 0 Helic… 0.990 0.897 0.915 1.27e-30 1.34e-13 5.80e-15 S
2 1280 1 1.85e-32 Staph… 0.769 0.805 0.855 2.49e- 7 1.12e- 9 7.94e-12 S
3 1491 1 4.96e- 5 Clost… 0.995 0.955 0.957 0 1.45e-20 5.78e-21 S
4 2100 1 0 Mycop… 0.967 0.753 0.823 4.02e-12 1.28e- 4 8.31e- 6 S
) and it does exit in cell.line.txt but it's name is "Mesomycoplasma hyorhinis" instead of "Mesomycoplasma hyorhinis":
"1780" "DRR099206" "S" 2100 "Mesomycoplasma hyorhinis" 1 16 12 0.16951652192781 109.98680158381
...
"9406" "DRR212593" "S" 2100 "Mesomycoplasma hyorhinis" 1 74 28 0.0359598167746648 28.3446712018141
...
"10385" "DRR326767" "S" 2100 "Mesomycoplasma hyorhinis" 3 5 1 0.157024022110657 125.099036737417
...(other thousands of items)
It's a common question cauz for the same species with the same taxid that they have various names(or the name used in cell.line.txt is not so standard and uniformed as the name in kraken.report.txt). But their taxid is necessarily the same.
So I suggest to rectify the code for step 6 listed in the README to:
# 6. Identifying contaminants and false positives (cell line quantile test)
# combine k-mer correlation tests, filter for significant values and species resolution
c3 = left_join(c, c2, by = 'name') %>%
left_join(select(kr, rank, name) %>% distinct()) %>%
subset(r1>0 & r2>0 & r3>0 & p<0.05 & p1<0.05 & p2<0.05 & p3<0.05 & rank == 'S')
print(c3)
cat('/n')
library(scales)
# cell.lines = readxl::read_xlsx('Table S4.xlsx')
# 'cell.lines.txt' 已经放在 /home/songkang/SAHMI-main/cell.lines.txt 目录下
cell.lines = read.delim(opt$cell_lines , sep = " ", header = T) %>% tibble()
df = cell.lines[,1:9] %>% mutate(study = 'cell lines')
# df = df[, -2]
df = rbind(df, kr)
# 定义绘图函数
plot_density <- function(df_subset) {
ggplot(df_subset, aes(rpmm, fill = study, ..scaled..)) +
geom_density(alpha = 0.5, color = NA) +
facet_wrap(~name, scales = 'free') +
scale_fill_manual(values = c('cell lines' = 'grey50', 'chen' = 'navyblue')) +
theme_minimal() +
xlab('Microbiome reads per million') +
ylab('Density') +
scale_x_log10(labels = trans_format("log10", math_format(10^.x)),
breaks = trans_breaks("log10", function(x) 10^x, n = 4),
oob = scales::squish, expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
theme(legend.title = element_blank(),
panel.border = element_rect(fill = NA, color = 'black'),
strip.background = element_blank(),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_blank(),
panel.grid = element_blank(),
strip.text = element_text(color = 'black', size = 10),
axis.text.x = element_text(color = 'black', size = 10),
axis.text.y = element_text(color = 'black', size = 10),
axis.title.y = element_text(color = 'black', size = 10),
axis.title.x = element_text(color = 'black', size = 10),
plot.margin = unit(c(0, 0.1, 0, 0), "cm"),
legend.key.size = unit(0.2, "cm"),
legend.text = element_text(color = 'black', size = 10),
legend.position = 'bottom')
}
# 分批绘制图形
for (i in seq(1, length(c3$taxid), by = 16)) {
df_subset <- subset(df, taxid %in% c3$taxid[i:min(i + 15, length(c3$taxid))])
# plot_density(df_subset)
print(ggplot(df_subset, aes(rpmm, fill = study, ..scaled..)) +
geom_density(alpha = 0.5, color = NA) +
# facet_wrap(~name, scales = 'free') +
facet_wrap(~taxid, scales = 'free') +
scale_fill_manual(values = c('cell lines' = 'grey50', 'chen' = 'navyblue')) +
theme_minimal() +
xlab('Microbiome reads per million') +
ylab('Density') +
scale_x_log10(labels = trans_format("log10", math_format(10^.x)),
breaks = trans_breaks("log10", function(x) 10^x, n = 4),
oob = scales::squish, expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
theme(legend.title = element_blank(),
panel.border = element_rect(fill = NA, color = 'black'),
strip.background = element_blank(),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_blank(),
panel.grid = element_blank(),
strip.text = element_text(color = 'black', size = 10),
axis.text.x = element_text(color = 'black', size = 10),
axis.text.y = element_text(color = 'black', size = 10),
axis.title.y = element_text(color = 'black', size = 10),
axis.title.x = element_text(color = 'black', size = 10),
plot.margin = unit(c(0, 0.1, 0, 0), "cm"),
legend.key.size = unit(0.2, "cm"),
legend.text = element_text(color = 'black', size = 10),
legend.position = 'bottom'))
# 关键代码,生成 检测出的菌群图
# ggsave(paste0("result_density_plot_", i, ".png"), plot = last_plot())
}
# get quantiles from cell line data (or alternatively use Table S4)
qtile = 0.99
q_df = cell.lines %>%
# group_by(name, rank) %>%
group_by(taxid, rank) %>%
summarize(CLrpmm = 10^quantile(log10(rpmm), qtile, na.rm = T),
.groups = 'keep')
# 保存表格为.tsv格式
# write.table(left_join(c3, q_df, by = c('name', 'rank')) %>%
write.table(left_join(c3, q_df, by = c('taxid', 'rank')) %>%
left_join(subset(kr, sample == opt$sample) %>% select(name, rpmm), by = c('name')),
file = "taxa.tsv", # 指定文件名
sep = "\t", # 指定分隔符为制表符
row.names = FALSE) # 不保存行名
and the result should be(I suggest use taxid in the plot insead of its long name):

And indeed "cell.lines.txt" or "Table S4" don't include some taxas , as mentioned in #7 .(or you can even say 'many taxas' instead of 'some taxas'). And I think you should for sure consider that the taxa present in the sample is truly positive. And I encounter many such cases while I use it.
But the taxa 'Mesomycoplasma hyorhinis'(taxid :2100) is for sure included in "cell.lines.txt" or "Table S4".
Take the ./example data/SRR9713132 in the README as example:

the result of the author is(reproduce by myself):
NOTE that: the plot show that 'Mesomycoplasma hyorhinis' exists only in 'study zhang'(32 samples from Zhang et al. 2019). But you can find it's taxid is 2100 from the output(like from
) and it does exit in cell.line.txt but it's name is "Mesomycoplasma hyorhinis" instead of "Mesomycoplasma hyorhinis":
It's a common question cauz for the same species with the same taxid that they have various names(or the name used in cell.line.txt is not so standard and uniformed as the name in kraken.report.txt). But their taxid is necessarily the same.
So I suggest to rectify the code for step 6 listed in the README to:
(I use 'taxid' instead of 'name' to join the table)
and the result should be(I suggest use taxid in the plot insead of its long name):

And indeed "cell.lines.txt" or "Table S4" don't include some taxas , as mentioned in #7 .(or you can even say 'many taxas' instead of 'some taxas'). And I think you should for sure consider that the taxa present in the sample is truly positive. And I encounter many such cases while I use it.
But the taxa 'Mesomycoplasma hyorhinis'(taxid :2100) is for sure included in "cell.lines.txt" or "Table S4".