Skip to content

Commit 16b21c0

Browse files
author
Alberto Paro
committed
Added contrib mailman indexing
1 parent 6ec2787 commit 16b21c0

File tree

2 files changed

+172
-1
lines changed

2 files changed

+172
-1
lines changed

README.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,9 @@ v. 0.14.0: Added delete of mapping type.
150150

151151
Some code cleanup.
152152

153-
Added reindex by query (usable only with my git branch).
153+
Added reindex by query (usable only with my elasticsearch git branch).
154+
155+
Added contrib with mailman indexing.
154156

155157
v. 0.13.1: Added jython support (HTTP only for now).
156158

contrib/mailman/archive_and_index.py

+169
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
#!/usr/bin/env python
2+
#
3+
# Copyright (C) 2010 by the Free Software Foundation, Inc.
4+
#
5+
# This program is free software; you can redistribute it and/or
6+
# modify it under the terms of the GNU General Public License
7+
# as published by the Free Software Foundation; either version 2
8+
# of the License, or (at your option) any later version.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU General Public License
16+
# along with this program; if not, write to the Free Software
17+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
18+
# USA.
19+
20+
"""This is a template for constructing an external archiver for situations
21+
where one wants to archive posts in Mailman's pipermail archive, but also
22+
wants to invoke some other process on the archived message after its URL
23+
and/or path are known.
24+
25+
It assumes this is invoked by mm_cfg.py settings like
26+
PUBLIC_EXTERNAL_ARCHIVER = '/path/to/Ext_Arch.py %(hostname)s %(listname)s'
27+
PRIVATE_EXTERNAL_ARCHIVER = '/path/to/Ext_Arch.py %(hostname)s %(listname)s'
28+
29+
The path in the sys.path.insert() below must be adjusted to the actual path
30+
to Mailman's bin/ directory, or you can simply put this script in Mailman's
31+
bin/ directory and it will work without the sys.path.insert(). Of course,
32+
you must add the code you want to the ext_process function.
33+
"""
34+
35+
import sys
36+
sys.path.insert(0, '/usr/local/mailman/bin') # path to your mailman dir
37+
import paths
38+
39+
import os
40+
import email
41+
import time
42+
43+
from cStringIO import StringIO
44+
45+
from Mailman import Message
46+
from Mailman import MailList
47+
from Mailman.Archiver import HyperArch
48+
from Mailman.Logging.Syslog import syslog
49+
from Mailman.Logging.Utils import LogStdErr
50+
51+
# For debugging, log stderr to Mailman's 'debug' log
52+
LogStdErr('debug', 'mailmanctl', manual_reprime=0)
53+
54+
def ext_process(listname, hostname, url, filepath, msg):
    """Index the just archived message into ElasticSearch.

    Arguments here are the list name, the host name, the URL to the just
    archived message, the file system path to the just archived message and
    the message object.

    If the status of the index cannot be read, the index is created and the
    mapping for the mail document type is installed before the message is
    indexed.  Exceptions raised while talking to ElasticSearch are logged to
    Mailman's 'error' log and are not propagated to the caller.

    Always returns None.
    """
    from pyes import ES
    from pyes.exceptions import ClusterBlockException, NoServerAvailable
    import datetime

    # CHANGE these settings to reflect your configuration
    _ES_SERVERS = ['127.0.0.1:9500']  # I prefer thrift
    _indexname = "mailman"
    _doctype = "mail"
    date = datetime.datetime.today()

    # Every log line carries the same context; build it once.
    context = 'listname: %s, hostname: %s, url: %s, path: %s, msg: %s'
    context_args = (listname, hostname, url, filepath, msg)

    try:
        iconn = ES(_ES_SERVERS)
        try:
            status = iconn.status(_indexname)
            # The original called an undefined ``logger`` here; the resulting
            # NameError made every run fall into the index-creation branch.
            syslog('debug', 'Indexer status:%s', status)
        except (ClusterBlockException, NoServerAvailable):
            # Connectivity problems are not a missing index: let the outer
            # handlers report them instead of trying to create the index.
            raise
        except Exception:
            # Any other failure to read the status is treated as "the index
            # does not exist yet": create it and install the mapping.
            iconn.create_index(_indexname)
            # Pause before querying the freshly created index, as the
            # original did.
            time.sleep(1)
            iconn.status(_indexname)
            mappings = {
                u'text': {'boost': 1.0,
                          'index': 'analyzed',
                          'store': 'yes',
                          'type': u'string',
                          "term_vector": "with_positions_offsets"},
                u'url': {'boost': 1.0,
                         'index': 'not_analyzed',
                         'store': 'yes',
                         'type': u'string',
                         "term_vector": "no"},
                u'title': {'boost': 1.0,
                           'index': 'analyzed',
                           'store': 'yes',
                           'type': u'string',
                           "term_vector": "with_positions_offsets"},
                u'date': {'store': 'yes',
                          'type': u'date'},
            }
            time.sleep(1)
            iconn.put_mapping(_doctype, mappings, _indexname)

        data = dict(url=url,
                    title=msg.get('subject'),
                    date=date,
                    text=str(msg))
        iconn.index(data, _indexname, _doctype)

        syslog('debug', context, *context_args)
    except ClusterBlockException:
        syslog('error', 'Cluster in recovery state: ' + context,
               *context_args)
    except NoServerAvailable:
        syslog('error', 'No server available: ' + context, *context_args)
    except Exception:
        # Best effort: an indexing failure is logged, not raised.
        # SystemExit and KeyboardInterrupt are no longer swallowed here.
        import traceback
        syslog('error', 'Unknown: ' + context + '\nstacktrace: %s',
               *(context_args + (repr(traceback.format_exc()),)))
125+
126+
def main():
    """Archive the message read from stdin, then post-process it.

    The pipermail archiver is invoked first so that the URL and file system
    path of the archived message are known; these are then passed, together
    with the list name, host name and parsed message, to ext_process().

    The host name is taken from sys.argv[1] and the list name from
    sys.argv[2].
    """
    hostname = sys.argv[1]
    listname = sys.argv[2]

    # The list is opened unlocked because ArchRunner already holds the lock.
    # This is safe since the list object is not modified here; ArchRunner's
    # lock plus pipermail's archive lock prevent any race conditions.
    mlist = MailList.MailList(listname, lock=False)

    # processUnixMailbox() needs a seekable file, so buffer stdin in memory.
    mailbox = StringIO(sys.stdin.read())

    # Only needed when a Message.Message instance is wanted; otherwise this
    # and the imports of email and Message can be dropped.
    msg = email.message_from_file(mailbox, Message.Message)

    archiver = HyperArch.HyperArchive(mlist)
    # Remember the number the next message will get, then add the message.
    sequence = archiver.sequence
    archiver.processUnixMailbox(mailbox)
    mailbox.close()

    # Work out the archive name and where the message's HTML page lives.
    archive = archiver.archive
    filename = '%06d.html' % sequence
    filepath = os.path.join(archiver.basedir, archive, filename)
    archiver.close()

    url = '%s%s/%s' % (mlist.GetBaseArchiveURL(), archive, filename)

    ext_process(listname, hostname, url, filepath, msg)
167+
168+
if __name__ == '__main__':
169+
main()

0 commit comments

Comments
 (0)