-
Notifications
You must be signed in to change notification settings - Fork 0
/
title_sim.py
107 lines (92 loc) · 2.96 KB
/
title_sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 实现个性化推荐的脚本
__author__ = 'ZYC@BUPT'
import random
import jieba
import sys
import json
jieba.load_userdict("E:/PYC/news-test/newword.dict")
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import math
import os
def Test2(rootDir):
    """Search ``rootDir`` recursively for the article closest to the query.

    Resets the module-level best match (``title``) and best distance
    (``mindic``) exactly once, then hands every path containing ".json" to
    ``static``.  The reset is done here and not in the recursive walker, so
    a match found before descending into a sub-directory is kept.

    Args:
        rootDir: Directory to scan.

    Returns:
        0 if ``static`` signalled an early stop (-1) anywhere in the tree,
        otherwise None.
    """
    global title, mindic
    mindic = 2  # initial upper bound; static() only accepts smaller distances
    title = {'title': '', 'url': ''}
    if _scan_json_tree(rootDir):
        return 0
    return None


def _scan_json_tree(directory):
    """Walk ``directory`` depth-first; return True once static() asks to stop."""
    for entry in os.listdir(directory):
        path = os.path.join(directory, entry)
        if os.path.isdir(path):
            # Propagate an early stop from a sub-directory up to the caller.
            if _scan_json_tree(path):
                return True
        elif path.find(".json") != -1:
            if static(path) == -1:
                return True
    return False
def static(path):
    """Score every article title in one JSON file against the current query.

    Reads the module-level query text ``ostr`` and the already selected
    titles ``js_title``.  Updates the module-level ``title`` / ``mindic``
    with the closest acceptable article seen so far, and leaves the most
    recent TF-IDF matrix in the module-level ``weight``.

    Args:
        path: Path of a JSON file whose top-level ``article`` entry is a
            list of objects carrying ``title`` and ``url`` keys.

    Returns:
        0 if the file cannot be read or parsed, -1 as soon as a match with
        distance <= 1.30 has been recorded (the caller stops searching),
        otherwise None.
    """
    global title, mindic, weight
    try:
        # Context manager: the handle is released even when parsing fails.
        with open(path, "r") as fp:
            new_dic = json.load(fp)
    except (IOError, ValueError):
        # Unreadable or malformed file: skip it rather than abort the scan.
        return 0
    for ne in new_dic['article']:
        candidate = ne['title']
        # Row 0 is the query, row 1 the space-joined segmented title.
        segcont = [ostr, " ".join(jieba.lcut(candidate, cut_all=True))]
        try:
            counts = CountVectorizer().fit_transform(segcont)
        except ValueError:
            # Raised by scikit-learn when neither text yields a token
            # (empty vocabulary); such a pair cannot be compared.
            continue
        weight = TfidfTransformer().fit_transform(counts).toarray()
        dis = dist(weight[0], weight[1])
        # The 1.1 lower bound rejects very close titles -- presumably to
        # avoid recommending the query article itself; confirm with owner.
        if 1.1 < dis < mindic and candidate not in js_title:
            title['title'] = candidate
            title['url'] = ne['url']
            mindic = dis
            if mindic <= 1.30:
                # Close enough: tell the caller to stop scanning.
                return -1
def dist(a, b):
    """Return the Euclidean distance between ``a`` and ``b`` as a float.

    The operands are subtracted directly, so they must support element-wise
    ``-`` (NumPy arrays of equal shape, or plain numbers).
    """
    delta = a - b
    sum_of_squares = np.square(delta).sum()
    return math.sqrt(sum_of_squares)
# --- Script entry ----------------------------------------------------------
# argv[1] is expected to look like "[<title>...<title>...NULL]": the literal
# "NULL" is dropped, the bracketed text is quoted so that it parses as a
# one-element JSON list, and that element is split on "..." into titles.
# NOTE(review): a title containing '"', '[' or ']' breaks this quoting trick.

# Directory holding the candidate news JSON files.
NEWS_JSON_DIR = "C:/xampp/htdocs/NewsFeed/json/tfidf"
# Number of entries the emitted JSON object should contain.
RESULT_COUNT = 10

title = {}
js_title = {}  # selected recommendations: title -> url
ostr = sys.argv[1]
ostr = ostr.replace("NULL", "")
ostr = ostr.replace('[', '["')
ostr = ostr.replace(']', '"]')
arr = json.loads(ostr)
ostr = arr[0]
part = ostr.split("...")
for pa in part:
    if not pa.strip():
        # Empty piece (e.g. what remains after "NULL" was removed): there
        # is nothing to segment, so skip the directory scan entirely.
        continue
    # static() reads the query from the module-level ``ostr``.
    ostr = " ".join(jieba.lcut(pa, cut_all=False))
    Test2(NEWS_JSON_DIR)
    if title['title']:
        # Only record a real match; Test2 leaves an empty title when no
        # article qualified.
        js_title[title['title']] = title['url']

if len(js_title) < RESULT_COUNT:
    # Pad with random articles.  Progress is measured by the number of
    # distinct titles, so a duplicate pick does not use up a slot.
    candidates = os.listdir(NEWS_JSON_DIR)
    attempts = 0
    # Bounded so the loop ends even when fewer than RESULT_COUNT distinct
    # articles exist.
    max_attempts = RESULT_COUNT * 20
    while candidates and len(js_title) < RESULT_COUNT and attempts < max_attempts:
        attempts += 1
        picked_file = NEWS_JSON_DIR + "/" + random.choice(candidates)
        try:
            with open(picked_file, 'r') as em_fp:
                articles = json.load(em_fp)['article']
        except (IOError, ValueError, KeyError):
            # Sub-directory, unreadable file, bad JSON or no article list:
            # padding is best-effort, try another file.
            continue
        if not articles:
            continue
        chosen = random.choice(articles)
        js_title[chosen['title']] = chosen['url']

print(json.dumps(js_title))