-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
400 lines (330 loc) · 15 KB
/
data.py
File metadata and controls
400 lines (330 loc) · 15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
# building_materials_processor.py
"""
建材数据处理完整脚本
功能:清洗、分析、分类、导出Reddit建材数据
作者:基于Reddit抓取数据的处理需求
"""
import pandas as pd
import numpy as np
from datetime import datetime
import re
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
def load_and_inspect_data(filename='building_materials_data.csv'):
    """Load the raw CSV and print a quick structural overview.

    Args:
        filename: path of the scraped Reddit data CSV.

    Returns:
        The loaded DataFrame, unchanged.
    """
    banner = "=" * 60
    print(banner)
    print("📊 第一步:加载并检查数据")
    print(banner)
    df = pd.read_csv(filename)
    # Basic shape / schema diagnostics before any cleaning happens.
    print(f"总数据量: {len(df)} 条")
    print(f"列名: {df.columns.tolist()}")
    print(f"\n各列缺失情况:")
    print(df.isnull().sum())
    print(f"\n数据类型:")
    print(df.dtypes)
    print(f"\n内存占用: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    return df
def clean_data(df):
    """Clean the raw Reddit building-materials DataFrame.

    Steps: parse timestamps and derive calendar features, strip URLs and
    collapse whitespace in text fields, build a combined Full_Text column,
    drop duplicate titles, filter out very short titles and negative
    upvotes, and normalise the comments field.

    Args:
        df: raw DataFrame; expected columns include Title, Content, Upvotes
            and optionally 'Created Time' / 'Comments Detail'.

    Returns:
        The cleaned (possibly filtered-copy) DataFrame.
    """
    print("\n" + "=" * 60)
    print("🧹 第二步:数据清洗")
    print("=" * 60)
    original_count = len(df)

    # 1. Timestamp column: try epoch seconds first, then a generic parse.
    if 'Created Time' in df.columns:
        try:
            df['Created Time'] = pd.to_datetime(df['Created Time'], unit='s')
        # Narrowed from a bare `except:` which also swallowed SystemExit /
        # KeyboardInterrupt; these are the errors to_datetime raises for
        # non-epoch input (e.g. ISO strings).
        except (ValueError, TypeError, OverflowError):
            df['Created Time'] = pd.to_datetime(df['Created Time'], errors='coerce')
        # Calendar features used by later trend analysis and plots.
        # Kept inside the guard: they dereference 'Created Time'.
        df['Year'] = df['Created Time'].dt.year
        df['Month'] = df['Created Time'].dt.month
        df['Month_Name'] = df['Created Time'].dt.strftime('%Y-%m')
        df['Day_of_Week'] = df['Created Time'].dt.day_name()
        df['Hour'] = df['Created Time'].dt.hour

    # 2. Text cleanup: drop URLs, collapse runs of whitespace.
    def clean_text(text):
        if pd.isna(text):
            return ""
        text = str(text)
        text = re.sub(r'http\S+', '', text)  # remove URLs
        text = re.sub(r'\s+', ' ', text)     # collapse whitespace
        return text.strip()

    df['Title_Clean'] = df['Title'].apply(clean_text)
    df['Content_Clean'] = df['Content'].apply(clean_text)

    # 3. Combined text consumed by keyword-based classification downstream.
    df['Full_Text'] = df['Title_Clean'] + ' ' + df['Content_Clean']

    # 4. Deduplicate on the raw title, keeping the first occurrence.
    print(f"去重前: {len(df)} 条")
    df = df.drop_duplicates(subset=['Title'], keep='first')
    print(f"去重后: {len(df)} 条")

    # 5. Drop low-quality rows: tiny titles and negative scores.
    df = df[df['Title_Clean'].str.len() > 10]
    df = df[df['Upvotes'] >= 0]

    # 6. Normalise the comments field (NaN -> "") and derive a rough
    # semicolon-separated comment count.
    if 'Comments Detail' in df.columns:
        df['Comments Detail'] = df['Comments Detail'].fillna("").astype(str)
        df['Comment_Count'] = df['Comments Detail'].apply(
            lambda x: len(x.split(';')) if x else 0
        )

    print(f"✅ 清洗完成:{original_count} → {len(df)} 条(移除 {original_count - len(df)} 条)")
    return df
def categorize_materials(df):
    """Tag each post with the building-material categories it mentions.

    A post may match several categories (comma-joined); posts matching
    none are labelled '未分类'. Matching is case-insensitive substring
    search over Full_Text.

    Returns:
        The DataFrame with a new Material_Type column.
    """
    print("\n" + "=" * 60)
    print("🏗️ 第三步:建材类型识别")
    print("=" * 60)
    # Keyword lexicon: category -> trigger words.
    material_keywords = {
        '木材类': ['lumber', 'wood', 'timber', 'plywood', 'hardwood', 'softwood', 'mdf', 'osb'],
        '金属类': ['metal', 'steel', 'aluminum', 'aluminium', 'iron', 'copper', 'zinc', 'brass'],
        '板材类': ['panel', 'board', 'sheet', 'plate', 'slab', 'drywall', 'sheetrock'],
        '型材类': ['profile', 'beam', 'channel', 'angle', 'tube', 'pipe', 'rod', 'bar'],
        '复合材料': ['composite', 'laminate', 'fiberglass', 'pvc', 'vinyl', 'acrylic'],
        '石材类': ['concrete', 'cement', 'brick', 'stone', 'tile', 'marble', 'granite', 'mortar']
    }

    def categorize_material(text):
        lowered = text.lower()
        hits = [
            category
            for category, keywords in material_keywords.items()
            if any(kw in lowered for kw in keywords)
        ]
        return ', '.join(hits) if hits else '未分类'

    df['Material_Type'] = df['Full_Text'].apply(categorize_material)
    print("建材类型分布:")
    print(df['Material_Type'].value_counts().head(10))
    return df
def extract_value_signals(df):
    """Flag posts that carry actionable signals.

    Adds boolean columns Has_Price_Info / Supply_Issue / Quality_Issue
    (case-insensitive keyword hits on Full_Text), a Region column, and an
    Engagement_Score (Upvotes + 2 * Comments).

    Returns:
        The DataFrame with the new signal columns.
    """
    print("\n" + "=" * 60)
    print("💎 第四步:提取信息差机会信号")
    print("=" * 60)

    # 1. Price-related discussion.
    price_keywords = ['price', 'cost', 'expensive', 'cheap', 'discount', 'sale',
                      'budget', 'afford', 'markup', 'wholesale']
    df['Has_Price_Info'] = df['Full_Text'].str.lower().apply(
        lambda x: any(word in x for word in price_keywords)
    )

    # 2. Supply-chain trouble.
    supply_keywords = ['shortage', 'delay', 'backorder', 'out of stock',
                       'supply chain', 'lead time', 'unavailable', 'waiting']
    df['Supply_Issue'] = df['Full_Text'].str.lower().apply(
        lambda x: any(word in x for word in supply_keywords)
    )

    # 3. Quality complaints.
    quality_keywords = ['defect', 'quality', 'problem', 'issue', 'broken',
                        'damaged', 'warranty', 'return']
    df['Quality_Issue'] = df['Full_Text'].str.lower().apply(
        lambda x: any(word in x for word in quality_keywords)
    )

    # 4. Region extraction. Match whole words only — the previous raw
    # substring test flagged 'US' inside words like 'just' or 'because'.
    regions = ['US', 'USA', 'Canada', 'UK', 'Europe', 'Asia', 'China',
               'California', 'Texas', 'New York', 'Australia']
    # Compile once, outside the per-row function.
    region_patterns = [
        (name, re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE))
        for name in regions
    ]

    def extract_region(text):
        found = [name for name, pat in region_patterns if pat.search(text)]
        return ', '.join(found) if found else 'Unknown'

    df['Region'] = df['Full_Text'].apply(extract_region)

    # 5. Engagement metric: comments weighted double relative to upvotes.
    df['Engagement_Score'] = df['Upvotes'] + df['Comments'] * 2

    print(f"有价格信息: {df['Has_Price_Info'].sum()} 条")
    print(f"有供应链问题: {df['Supply_Issue'].sum()} 条")
    print(f"有质量问题: {df['Quality_Issue'].sum()} 条")
    return df
def create_visualizations(df):
    """Render and save four summary charts as 300-dpi PNGs in the CWD.

    Charts: monthly post trend, top-10 subreddits, top-10 material types,
    and the engagement-score distribution. No return value.
    """
    print("\n" + "=" * 60)
    print("📊 第五步:生成可视化图表")
    print("=" * 60)
    # CJK font fallback: matplotlib tries each family in order, so one
    # list covers macOS (Arial Unicode MS) and Windows (SimHei). The old
    # try/except around rcParams assignment was ineffective — assigning a
    # missing font name never raises, so the fallback branch never ran.
    plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # 1. Monthly posting trend (only if clean_data derived Month_Name).
    if 'Month_Name' in df.columns:
        plt.figure(figsize=(12, 5))
        monthly_trend = df['Month_Name'].value_counts().sort_index()
        monthly_trend.plot(kind='bar', color='steelblue')
        plt.title('Monthly Post Trend', fontsize=14, fontweight='bold')
        plt.xlabel('Month')
        plt.ylabel('Number of Posts')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('monthly_trend.png', dpi=300, bbox_inches='tight')
        print("✓ 月度趋势图已保存: monthly_trend.png")
        plt.close()

    # 2. Subreddit activity (top 10, horizontal bars).
    plt.figure(figsize=(10, 6))
    df['Subreddit'].value_counts().head(10).plot(kind='barh', color='coral')
    plt.title('Top 10 Most Active Subreddits', fontsize=14, fontweight='bold')
    plt.xlabel('Number of Posts')
    plt.tight_layout()
    plt.savefig('subreddit_activity.png', dpi=300, bbox_inches='tight')
    print("✓ 子版块活跃度图已保存: subreddit_activity.png")
    plt.close()

    # 3. Material-type distribution. Multi-label rows ("木材类, 金属类")
    # are split and exploded so each category is counted separately.
    plt.figure(figsize=(10, 6))
    material_counts = df['Material_Type'].str.split(', ').explode().value_counts().head(10)
    material_counts.plot(kind='barh', color='seagreen')
    plt.title('Top 10 Material Types', fontsize=14, fontweight='bold')
    plt.xlabel('Mentions')
    plt.tight_layout()
    plt.savefig('material_types.png', dpi=300, bbox_inches='tight')
    print("✓ 建材类型分布图已保存: material_types.png")
    plt.close()

    # 4. Engagement histogram with the median marked.
    plt.figure(figsize=(10, 5))
    plt.hist(df['Engagement_Score'], bins=50, color='purple', alpha=0.7, edgecolor='black')
    plt.title('Engagement Score Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Engagement Score')
    plt.ylabel('Frequency')
    plt.axvline(df['Engagement_Score'].median(), color='red', linestyle='--',
                label=f'Median: {df["Engagement_Score"].median():.0f}')
    plt.legend()
    plt.tight_layout()
    plt.savefig('engagement_distribution.png', dpi=300, bbox_inches='tight')
    print("✓ 参与度分布图已保存: engagement_distribution.png")
    plt.close()
def export_results(df):
    """Write the processed datasets to CSV files in the CWD.

    Exports: the full cleaned set, high-value posts (price info, supply
    issues, or top-quartile engagement), per-material subsets (only when a
    category has more than 10 rows), and supply-chain-issue posts.

    Returns:
        The high-value posts DataFrame, sorted by Engagement_Score desc.
    """
    print("\n" + "=" * 60)
    print("💾 第六步:导出数据")
    print("=" * 60)

    # 1. Full cleaned dataset.
    df.to_csv('building_materials_cleaned.csv', index=False, encoding='utf-8')
    print(f"✓ 清洗后完整数据: building_materials_cleaned.csv ({len(df)}条)")

    # 2. High-value opportunities (boolean columns used directly instead
    # of the non-idiomatic `== True` comparisons).
    valuable_posts = df[
        df['Has_Price_Info']
        | df['Supply_Issue']
        | (df['Engagement_Score'] > df['Engagement_Score'].quantile(0.75))
    ].sort_values('Engagement_Score', ascending=False)
    valuable_posts.to_csv('high_value_opportunities.csv', index=False, encoding='utf-8')
    print(f"✓ 高价值机会数据: high_value_opportunities.csv ({len(valuable_posts)}条)")

    # 3. Per-material exports; skip sparse categories.
    for material_type in ['木材类', '金属类', '板材类', '型材类']:
        subset = df[df['Material_Type'].str.contains(material_type, na=False)]
        if len(subset) > 10:  # only export categories with enough rows
            filename = f'{material_type}_data.csv'
            subset.to_csv(filename, index=False, encoding='utf-8')
            # Log the actual file name (was a dead "(unknown)" placeholder).
            print(f"✓ {material_type}数据: {filename} ({len(subset)}条)")

    # 4. Supply-chain issue subset.
    supply_issues = df[df['Supply_Issue']].sort_values('Engagement_Score', ascending=False)
    if len(supply_issues) > 0:
        supply_issues.to_csv('supply_chain_issues.csv', index=False, encoding='utf-8')
        print(f"✓ 供应链问题数据: supply_chain_issues.csv ({len(supply_issues)}条)")

    return valuable_posts
def generate_report(df, valuable_posts):
    """Compose the text analysis report, print it, and save it to
    data_analysis_report.txt in the CWD.

    Args:
        df: fully processed DataFrame (needs the signal/engagement columns).
        valuable_posts: the high-value subset returned by export_results.
    """
    print("\n" + "=" * 60)
    print("📄 第七步:生成分析报告")
    print("=" * 60)
    # Main body of the report as one formatted template.
    report = f"""
{'='*70}
建材数据处理与分析报告
{'='*70}
生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
一、数据概览
{'='*70}
总数据量: {len(df)} 条
高价值机会: {len(valuable_posts)} 条
时间范围:
最早: {df['Created Time'].min().strftime('%Y-%m-%d')}
最晚: {df['Created Time'].max().strftime('%Y-%m-%d')}
二、子版块活跃度 TOP 5
{'='*70}
{df['Subreddit'].value_counts().head(5).to_string()}
三、热门建材类型 TOP 5
{'='*70}
{df['Material_Type'].value_counts().head(5).to_string()}
四、关键机会指标
{'='*70}
价格相关帖子: {df['Has_Price_Info'].sum()} 条 ({df['Has_Price_Info'].sum()/len(df)*100:.1f}%)
供应链问题帖子: {df['Supply_Issue'].sum()} 条 ({df['Supply_Issue'].sum()/len(df)*100:.1f}%)
质量问题帖子: {df['Quality_Issue'].sum()} 条 ({df['Quality_Issue'].sum()/len(df)*100:.1f}%)
五、参与度统计
{'='*70}
平均 Upvotes: {df['Upvotes'].mean():.1f}
平均评论数: {df['Comments'].mean():.1f}
平均参与度得分: {df['Engagement_Score'].mean():.1f}
最高 Upvotes: {df['Upvotes'].max()}
最多评论: {df['Comments'].max()}
六、TOP 10 高互动帖子(潜在机会)
{'='*70}
"""
    # Append the ten most-engaged posts, one two-line entry each.
    top_posts = df.nlargest(10, 'Engagement_Score')[['Title', 'Subreddit', 'Upvotes', 'Comments', 'Material_Type']]
    entries = []
    for _, post in top_posts.iterrows():
        entries.append(f"\n{post['Title'][:70]}\n")
        entries.append(f" └─ {post['Subreddit']} | ↑{post['Upvotes']} 💬{post['Comments']} | {post['Material_Type']}\n")
    report += ''.join(entries)
    report += f"\n{'='*70}\n"

    # Persist, then echo to the console.
    with open('data_analysis_report.txt', 'w', encoding='utf-8') as f:
        f.write(report)
    print(report)
    print("✓ 完整报告已保存: data_analysis_report.txt")
def show_key_opportunities(df):
    """Print a console digest of the most actionable posts (no return)."""
    print("\n" + "=" * 60)
    print("🎯 关键机会速览")
    print("=" * 60)

    # 1. Supply-chain trouble, ranked by engagement.
    shortage_posts = df[df['Supply_Issue'] == True].nlargest(5, 'Engagement_Score')
    if not shortage_posts.empty:
        print("\n⚠️ TOP 5 供应链问题(套利机会):")
        for _, post in shortage_posts.iterrows():
            print(f" • {post['Title'][:60]}...")
            print(f" └─ {post['Subreddit']} | ↑{post['Upvotes']} 💬{post['Comments']} | {post['Material_Type']}")

    # 2. Price discussion, ranked by engagement.
    price_posts = df[df['Has_Price_Info'] == True].nlargest(5, 'Engagement_Score')
    if not price_posts.empty:
        print("\n💰 TOP 5 价格相关(价差机会):")
        for _, post in price_posts.iterrows():
            print(f" • {post['Title'][:60]}...")
            print(f" └─ {post['Material_Type']} | ↑{post['Upvotes']} 💬{post['Comments']}")

    # 3. Hottest posts from the last 30 days.
    cutoff = datetime.now() - pd.Timedelta(days=30)
    recent = df[df['Created Time'] > cutoff]
    if not recent.empty:
        print(f"\n🔥 最近30天热门 ({len(recent)}条):")
        for _, post in recent.nlargest(3, 'Engagement_Score').iterrows():
            print(f" • {post['Title'][:60]}...")
            print(f" └─ {post['Created Time'].strftime('%Y-%m-%d')} | {post['Subreddit']}")
def main():
    """Run the full pipeline: load → clean → classify → signal-extract →
    visualize → export → report → console digest.

    Catches FileNotFoundError (missing input CSV) and any other failure,
    printing a message instead of crashing.
    """
    print("\n" + "🚀" * 30)
    print("建材数据处理系统启动")
    print("🚀" * 30 + "\n")
    try:
        df = load_and_inspect_data('building_materials_data.csv')
        # Successive enrichment passes; each returns the updated frame.
        for step in (clean_data, categorize_materials, extract_value_signals):
            df = step(df)
        create_visualizations(df)
        valuable_posts = export_results(df)
        generate_report(df, valuable_posts)
        show_key_opportunities(df)
        print("\n" + "✅" * 30)
        print("数据处理完成!")
        print("✅" * 30)
        print("\n生成的文件:")
        print(" 📊 可视化图表: monthly_trend.png, subreddit_activity.png, material_types.png, engagement_distribution.png")
        print(" 💾 数据文件: building_materials_cleaned.csv, high_value_opportunities.csv, supply_chain_issues.csv")
        print(" 📄 分析报告: data_analysis_report.txt")
    except FileNotFoundError:
        print("❌ 错误:找不到 building_materials_data.csv 文件")
        print("请确保数据文件在当前目录下")
    except Exception as e:
        print(f"❌ 处理过程中出现错误: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()