-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathDataAnalysis_For_dataset_02.py
266 lines (245 loc) · 12.2 KB
/
DataAnalysis_For_dataset_02.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import os
import pandas as pd
import numpy as np
import os.path as osp
import datetime
import tqdm
import math
import time
from matplotlib import pyplot as plt
def wish_filter(up_only, time, pool_number, name):
    """Classify a pull against the character-event wish schedule.

    Two modes are supported:

    * ``up_only`` truthy: find the event window that contains ``time`` and
      report whether ``name`` is one of that window's rate-up characters.
      ``pool_number`` is ignored.
    * ``up_only`` falsy: report whether ``time`` falls inside the window
      selected by ``pool_number`` (1-based). ``name`` is ignored.

    Args:
        up_only: Selects the mode described above.
        time: Pull timestamp as text in ``'%Y-%m-%d %H:%M:%S'`` format.
        pool_number: 1-based index into the schedule (window-check mode only).
        name: Item name to look up (rate-up mode only).

    Returns:
        1 when the check succeeds, otherwise 0.

    Raises:
        ValueError: If ``time`` matches none of the accepted formats.
        IndexError: If ``pool_number`` is past the end of the schedule.
    """
    t_formats = ['%Y-%m-%d %H:%M:%S']
    # Event windows 1..12. Each pool is named after the last entry of its
    # up_characters row: Venti, Klee, Tartaglia, Zhongli, Albedo, Ganyu, Xiao,
    # Keqing, Hu Tao, Venti, Tartaglia, Zhongli.
    start_times = ['2020-09-28 00:00:00', '2020-10-20 18:00:00', '2020-11-11 06:00:00', '2020-12-01 18:00:00',
                   '2020-12-22 18:00:00', '2021-01-12 18:00:00', '2021-02-02 18:00:00', '2021-02-17 18:00:00',
                   '2021-03-02 18:00:00', '2021-03-17 06:00:00', '2021-04-06 18:00:00', '2021-04-28 07:00:00']
    end_times = ['2020-10-18 17:59:59', '2020-11-10 14:59:59', '2020-12-01 15:59:59', '2020-12-22 14:59:59',
                 '2021-01-12 15:59:59', '2021-02-02 14:59:59', '2021-02-17 15:59:59', '2021-03-02 15:59:59',
                 '2021-03-16 15:00:00', '2021-04-06 16:00:00', '2021-04-27 14:59:00', '2021-05-18 17:59:00']
    up_characters = [['芭芭拉', '菲谢尔', '香菱', '温迪'], ['行秋', '诺艾尔', '砂糖', '可莉'], ['迪奥娜', '北斗', '凝光', '达达利亚'],
                     ['辛焱', '雷泽', '重云', '钟离'], ['菲谢尔', '砂糖', '班尼特', '阿贝多'], ['香菱', '行秋', '诺艾尔', '甘雨'],
                     ['迪奥娜', '北斗', '辛焱', '魈'], ['凝光', '班尼特', '芭芭拉', '刻晴'], ['行秋', '香菱', '重云', '胡桃'],
                     ['砂糖', '雷泽', '诺艾尔', '温迪'], ['菲谢尔', '芭芭拉', '罗莎莉亚', '达达利亚'], ['烟绯', '诺艾尔', '迪奥娜', '钟离']]

    def get_time(text_time):
        """Parse ``text_time`` with the first matching format in ``t_formats``."""
        for t_format in t_formats:
            try:
                return datetime.datetime.strptime(text_time, t_format)
            except (ValueError, TypeError):
                continue
        # Fail loudly here instead of returning None, which would only surface
        # later as an unhelpful "None >= datetime" TypeError.
        raise ValueError('unrecognised wish timestamp: ' + repr(text_time))

    # The pull time does not change between windows, so parse it exactly once.
    pull_time = get_time(time)

    if up_only:  # rate-up check: locate the window the pull belongs to
        for start_text, end_text, featured in zip(start_times, end_times, up_characters):
            if get_time(start_text) <= pull_time <= get_time(end_text):
                return 1 if name in featured else 0
        return 0  # pulled outside every known event window

    # Window check: is the pull inside the requested (1-based) event window?
    start_time = get_time(start_times[pool_number - 1])
    end_time = get_time(end_times[pool_number - 1])
    return 1 if start_time <= pull_time <= end_time else 0
# ---- Dataset location -------------------------------------------------------
base_folder = 'GI_gacha_dataset_02'
file_list = os.listdir(base_folder)  # every entry is treated as one sample folder
print('分析样本数量:' + str(len(file_list)))
file_names = ['gacha100.csv', 'gacha200.csv', 'gacha301.csv', 'gacha302.csv']  # novice / standard / character / weapon pool

# ---- Accumulators -----------------------------------------------------------
# 5-star histogram indexed [pulls since the previous 5-star, pool, slot];
# slot 0 = rate-up character, slot 1 = everything else (weapons, non-UP characters).
star_5_distribution = np.zeros([91, 4, 2], dtype=int)
# 4-star histogram indexed [pulls since the previous 4-star, pool, slot];
# slot 0 = rate-up character, slot 1 = other character, slot 2 = weapon.
star_4_distribution = np.zeros([21, 4, 3], dtype=int)
default_header = ['抽卡时间', '编号', '名称', '类别', '星级']
gacha_time_5 = 0     # total pulls spent on the counted 5-stars
gacha_time_4 = 0     # total pulls spent on the counted 4-stars
all_raw_pull = 0     # every row read, before any filtering
max_5_star_pull = 0  # largest gap seen before a 5-star

# ---- Analysis switches ------------------------------------------------------
least_gacha_time = 0   # skip pool files whose last row index is below this
ignore_5_star = 0      # number of leading 5-stars to discard in each pool file
ignore_4_star = 0      # number of leading 4-stars to discard in each pool file
pure_4_star_model = 0  # 1: discard a 4-star gap if a 5-star occurred inside it
pool_select = 0        # 0: no event-pool filtering; non-zero: keep only pools in pool_list
pool_list = [1, ]      # event pools kept when pool_select is enabled

for i in tqdm.tqdm(file_list):  # progress bar over sample folders
    for j in range(4):  # the four pools, in file_names order
        # if j != 1:  # study the standard pool only
        #     continue
        # if j != 2:  # study the character pool only
        #     continue
        file_name = file_names[j]
        processing_file = osp.join(base_folder, str(i).rjust(4, '0'), file_name)
        if not os.path.exists(processing_file):
            continue
        try:
            data = pd.read_csv(processing_file)
        except Exception:
            # Best effort: an empty or unreadable file is skipped. Unlike a bare
            # ``except:`` this still lets Ctrl-C / SystemExit stop the run.
            continue
        # A header-only CSV has an empty index, on which max() would raise.
        if data.empty or max(data.index) < least_gacha_time:
            continue

        counter_5 = 0             # pulls since the last 5-star
        counter_4 = 0             # pulls since the last 4-star
        first_5 = ignore_5_star   # leading 5-stars still to discard (reroll-account bias)
        first_4 = ignore_4_star   # leading 4-stars still to discard
        been_5 = 0                # set once a 5-star is seen inside the current 4-star gap

        for index, row in data.iterrows():
            all_raw_pull += 1
            counter_4 += 1
            counter_5 += 1
            # Read the row once; columns are used positionally as
            # time, item name, item type, star rating.
            fields = row.values
            pull_time = fields[0]
            item_name = fields[1]
            item_type = fields[2]
            this_star = fields[3]

            if this_star == 4:
                if been_5 and pure_4_star_model:  # gap interrupted by a 5-star: drop it
                    counter_4 = 0
                    been_5 = 0
                    continue
                if first_4 > 0:  # discard the configured number of leading 4-stars
                    first_4 -= 1
                    counter_4 = 0
                    continue
                # With pool filtering on, drop pulls outside every selected event pool.
                if pool_select and not any(
                        wish_filter(0, pull_time, pool_num, 'NULL') for pool_num in pool_list):
                    counter_4 = 0
                    continue
                if counter_4 >= 11:  # low-probability event
                    print(i + ' ' + file_name + ' ' + str(index) + ' 四星间隔为11')
                if counter_4 >= 12:  # extremely unlikely, worth a manual check
                    print(i)
                    print(counter_4)
                    print('四星间隔超出12,需要检查')
                if item_type == '武器':
                    star_4_distribution[counter_4, j, 2] += 1
                elif item_type == '角色':
                    if j == 1:  # standard pool: counted in the "other character" slot
                        star_4_distribution[counter_4, j, 1] += 1
                    elif wish_filter(1, pull_time, 0, item_name):  # rate-up character
                        star_4_distribution[counter_4, j, 0] += 1
                    else:  # non-UP 4-star character
                        star_4_distribution[counter_4, j, 1] += 1
                gacha_time_4 += counter_4  # pulls spent on this 4-star
                counter_4 = 0
                been_5 = 0

            if this_star == 5:
                max_5_star_pull = max(max_5_star_pull, counter_5)
                been_5 = 1
                if first_5 > 0:  # discard the configured number of leading 5-stars
                    first_5 -= 1
                    counter_5 = 0
                    continue
                if item_type == '武器':  # experimental: weapons go to the non-UP slot
                    star_5_distribution[counter_5, j, 1] += 1
                elif wish_filter(1, pull_time, 0, item_name):  # rate-up character
                    star_5_distribution[counter_5, j, 0] += 1
                else:
                    star_5_distribution[counter_5, j, 1] += 1
                gacha_time_5 += counter_5
                counter_5 = 0
def produce_var(star, gacha_data, check_p):
    """Print summary statistics for a pull-gap histogram.

    Args:
        star: Rarity label; only used in the printed header.
        gacha_data: Sequence where ``gacha_data[k]`` is how many drops needed
            ``k`` pulls. Index 0 counts toward the sample size but is not
            included in the mean/variance sums.
        check_p: Hypothesised per-pull probability; the sample mean is
            compared against ``1 / check_p``.

    Returns:
        None. All results are printed.
    """
    sample_size = sum(gacha_data)  # computed once instead of five times
    print('===' + str(star) + '星分析===')
    print('样本量' + str(sample_size))
    if sample_size < 2:
        # The unbiased variance divides by (n - 1); with n < 2 nothing below is defined.
        print('样本量不足,无法计算统计量')
        return

    gaps = range(1, len(gacha_data))
    data_mean = sum(k * gacha_data[k] for k in gaps) / sample_size
    # Unbiased sample variance of the gap length.
    s_2 = sum(gacha_data[k] * (k - data_mean) ** 2 for k in gaps) / (sample_size - 1)

    std_error = math.sqrt(s_2 / sample_size)
    if std_error == 0:
        # Every sample has the same gap, so the statistic is undefined.
        stander_check = math.nan
    else:
        stander_check = (data_mean - 1 / check_p) / std_error

    print('样本均值' + str(data_mean))
    print('样本平均概率' + str(1 / data_mean))
    print('样本方差' + str(s_2))
    # Test statistic mapped to a standard normal; not mathematically rigorous, reference only.
    print('转为01正态的参考值' + str(stander_check))
print(f'原始数据统计总抽数:{all_raw_pull}')
# Collapse each histogram over the standard and character pools (pool indices
# 1 and 2) and over every category slot, leaving one count per pull gap.
need_4 = star_4_distribution[0:12, 1:3, :].sum(axis=(1, 2))
need_5 = star_5_distribution[0:91, 1:3, :].sum(axis=(1, 2))
# Summary statistics
produce_var(4, need_4, 0.13)
produce_var(5, need_5, 0.016)
# Scratch code from the original author's 4-star analysis, kept commented out
# (the author noted it is ad hoc and due for a rewrite).
# print('四星数量: ' + str(need_4.sum()))
# # print(temp)
# # print(*(need_4[1:12]), sep='\t')
# print('UP四星角色')
# need_4 = np.sum(np.sum(star_4_distribution[0:12, 2:3, 0:1], axis=2), axis=1)  # character pool
# print(*(need_4[1:12]), sep='\t')
# print('四星武器')
# need_4 = np.sum(np.sum(star_4_distribution[0:12, 2:3, 2:3], axis=2), axis=1)  # character pool
# print(*(need_4[1:12]), sep='\t')
# print('其他四星角色')
# need_4 = np.sum(np.sum(star_4_distribution[0:12, 2:3, 1:2], axis=2), axis=1)  # character pool
# print(*(need_4[1:12]), sep='\t')
#
# print('常驻四星角色')
# need_4 = np.sum(np.sum(star_4_distribution[0:12, 1:2, 1:2], axis=2), axis=1)  # standard pool
# print(*(need_4[1:12]), sep='\t')
# print('常驻四星武器')
# need_4 = np.sum(np.sum(star_4_distribution[0:12, 1:2, 2:3], axis=2), axis=1)  # standard pool
# print(*(need_4[1:12]), sep='\t')
#
# print('五星数量: ' + str(need_5.sum()))
# print(*(need_5[1:91]), sep='\t')
print(f'抽到五星所用的最多抽数:{max_5_star_pull}')
def plot_5_star_compare_graph(x, weapon_pool):
    """Plot the modelled 5-star pull distribution against the observed one.

    The figure is saved to ``plot_graph/5star_distribution_<pool>.png`` and
    then shown.

    Args:
        x: Observed histogram; ``x[k]`` is the number of 5-stars obtained on
            pull ``k`` after the previous one. It is sliced and divided
            element-wise, so it must be a NumPy array.
        weapon_pool: Truthy selects the weapon-pool model parameters,
            otherwise the standard/character-pool parameters are used.
    """
    Expect_distribution_5 = np.zeros(91, dtype=float)
    state_P = 1           # probability of still having no 5-star before this pull
    expect_pull_time = 0  # expected number of pulls per 5-star under the model
    base_P = 0.006        # base per-pull probability
    pity_begin = 74       # first pull of the ramp-up segment
    guarantee_pull = 90   # pull on which a 5-star is certain
    file_text = 'stander&character'
    max_pull = 0
    if weapon_pool:
        # The original author notes this single-ramp model is only an
        # approximation for the weapon pool, judged close enough for plotting.
        base_P = 0.007
        pity_begin = 63
        guarantee_pull = 80
        file_text = 'weapon'

    # Per the original author's note: a revised model based on second-test data.
    for pull in range(1, 91):
        if pull == guarantee_pull:  # hard pity
            chance = 1.0
        elif pull >= pity_begin:    # ramp-up segment
            # Clamp to 1: with the weapon parameters the raw ramp exceeds 1 from
            # pull 77, which would make state_P negative and yield negative mass.
            chance = min(1.0, base_P + base_P * 10 * (pull - pity_begin + 1))
        else:                       # flat segment before the ramp
            chance = base_P
        Expect_distribution_5[pull] = state_P * chance
        expect_pull_time += Expect_distribution_5[pull] * pull
        state_P = state_P * (1 - chance)  # survive to the next pull

    # Sample mean gap, whose reciprocal is shown as the sample probability.
    tot = 0
    for k in range(1, len(x)):
        tot += k * x[k]
        if x[k]:
            max_pull = k
    data_mean = tot / sum(x)

    pulls = range(1, guarantee_pull + 1)
    observed = x[1:guarantee_pull + 1]
    observed_total = sum(observed)  # used for both normalisation and the label

    plt.plot(pulls, Expect_distribution_5[1:guarantee_pull + 1], label='theory')
    plt.plot(pulls, observed / observed_total,
             label='actual situation in dataset_02')
    plt.title(file_text + ' 5 star distribution')
    plt.legend(loc="upper left")
    plt.text(15, 0.06, '5star sample number:' + str(observed_total) + '\n' +
             'theory probability:' + str(round(100 / expect_pull_time, 4)) + '%' + '\n' +
             'sample probability:' + str(round(100 / data_mean, 4)) + '%' + '\n' +
             'max pull:' + str(max_pull) + '\n' +
             'plot time:' + time.asctime(),
             verticalalignment="top", horizontalalignment="left")
    # Build the path portably and make sure the output folder exists; the old
    # hard-coded backslash only produced a sub-folder path on Windows.
    os.makedirs('plot_graph', exist_ok=True)
    plt.savefig(osp.join('plot_graph', '5star_distribution_' + file_text + '.png'))
    plt.show()
# Draw the 5-star distributions.
# Standard + character pools (pool indices 1 and 2), summed over both slots.
need_5 = star_5_distribution[0:91, 1:3, :].sum(axis=(1, 2))
plot_5_star_compare_graph(need_5, 0)
# Weapon pool (pool index 3), summed over both slots.
need_5 = star_5_distribution[0:91, 3:4, :].sum(axis=(1, 2))
# print('武器池五星数量'+str(need_5.sum()))
# print(*(need_5[1:91]), sep='\t')
plot_5_star_compare_graph(need_5, 1)