# mine-pubs

#!/usr/bin/env python

import codecs
import locale
import sys
import urllib
import logging
import textwrap

from argparse import ArgumentParser

from infoscience import InfoscienceEntry, InfoscienceParser
from string import Template

# Complete XHTML page: links both the common and the stand-alone stylesheets
# and repeats the title in an <h1> above the paper list.
# Placeholders: $title, $content.
html_standalone_template = Template("""\
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  <title>$title</title>
  <link href="dslab-infoscience-common.css" rel="stylesheet" type="text/css"/>
  <link href="dslab-standalone.css" rel="stylesheet" type="text/css"/>
</head>
<body>
<h1>$title</h1>
<div class="iscollection" id="paperlist">
$content
</div>
</body>
</html>
""")

# Same page skeleton without the <h1> heading and without the stand-alone
# stylesheet. Placeholders: $title, $content.
html_embedded_template = Template("""\
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  <title>$title</title>
  <link href="dslab-infoscience-common.css" rel="stylesheet" type="text/css"/>
</head>
<body>
<div class="iscollection" id="paperlist">
$content
</div>
</body>
</html>
""")

# Heading that introduces one group of records. Placeholder: $year.
html_year_header = Template("""\
<h4 class="isyear">$year</h4>
""")

# One record whose title is a hyperlink. Placeholders: $url, $title, $authors.
html_record_url = Template("""\
<div class="isrecord">
  <span class="istitle"><a href="$url">$title</a></span>.
  <span class="isauthors">$authors</span>
</div>
""")

# One record with an unlinked title. Placeholders: $title, $authors.
html_record_plain = Template("""\
<div class="isrecord">
  <span class="istitle">$title</span>.
  <span class="isauthors">$authors</span>
</div>
""")

# Search endpoint that the query URL is built on.
base_infoscience_url = "http://infoscience.epfl.ch/search"

# Value sent as the "cc" query parameter.
search_cc = "Infoscience/Research/IC/IIF/DSLAB"
# NOTE(review): not referenced by any code visible in this file.
search_p = ""
# Value sent as the "of" query parameter
# (presumably selects an XML output format -- TODO confirm).
search_of = "xm"


def _formatAuthors(authors):
    aexp = [a[1] + " " + a[0] for a in authors]
    if len(aexp) > 2:
        astr = ", ".join(aexp[0:-1]) + ", and " + aexp[-1]
    elif len(aexp) == 2:
        astr = " and ".join(aexp)
    else:
        astr = aexp[0]
        
    return astr

def printText(collection):
    """Print the collection to stdout as a plain-text list grouped by year.

    ``collection`` must provide ``itervalues()``; every value needs the
    attributes ``year``, ``id``, ``title`` and ``authors``.  Years are listed
    newest first, and within a year the entries are ordered by descending
    ``id``.
    """
    records = list(collection.itervalues())
    distinct_years = set(record.year for record in records)

    for year in sorted(distinct_years, reverse=True):
        print("%d: " % year)
        of_this_year = [record for record in records if record.year == year]
        of_this_year.sort(key=lambda record: record.id, reverse=True)
        for record in of_this_year:
            line = "* '%s', %s" % (record.title, _formatAuthors(record.authors))
            print(line)

def printHTML(collection, genbody=True, standalone=False, minyear=0):
    """Print the collection to stdout as an HTML publication list.

    This single definition replaces two earlier ones of the same name; the
    first was shadowed by the second and produced no output.

    Args:
        collection: object providing ``itervalues()``; every value needs the
            attributes ``year``, ``id``, ``title``, ``authors`` and
            ``paperurl``.
        genbody: when true, wrap the list in a minimal
            ``<html><head>...</head><body>`` document.
        standalone: accepted for interface compatibility; it does not
            influence the output.
        minyear: when non-zero, every entry whose year is <= ``minyear``
            (the boundary year included) is listed under one shared
            "Pre-<minyear>" heading instead of a heading per year.  The
            default 0 disables the grouping.

    Sections are printed newest first; inside a section the entries are
    ordered by descending (year, id).

    NOTE(review): titles, URLs and author strings are inserted verbatim,
    without HTML escaping.
    """
    # Dedent the markup once, *before* substitution, so that a title
    # containing line breaks cannot change the computed margin.
    record_markup = textwrap.dedent("""\
        <div class="isrecord">
          <span class="istitle">%s</span>.
          <span class="isauthors">%s</span>
        </div>""")

    # Bucket the entries by section in a single pass over the collection.
    sections = {}
    for entry in collection.itervalues():
        if minyear and entry.year <= minyear:
            section = minyear
        else:
            section = entry.year
        sections.setdefault(section, []).append(entry)

    if genbody:
        print(textwrap.dedent("""\
            <html xmlns="http://www.w3.org/1999/xhtml">
            <head>
              <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
              <title>Infoscience</title>
            </head>
            <body>"""))
    print("""<div class="iscollection" id="paperlist">""")

    for section in sorted(sections, reverse=True):
        if minyear and section <= minyear:
            heading = "Pre-%d" % minyear
        else:
            heading = section
        print("""<h4 class="isyear">%s</h4>""" % heading)

        ordered = sorted(sections[section],
                         key=lambda entry: (entry.year, entry.id),
                         reverse=True)
        for entry in ordered:
            if entry.paperurl:
                title = """<a href="%s">%s</a>""" % (entry.paperurl, entry.title)
            else:
                title = entry.title
            print(record_markup % (title, _formatAuthors(entry.authors)))

    print("""</div>""")
    if genbody:
        print(textwrap.dedent("""\
            </body>
            </html>"""))

def main():
    """Load publication records and print them to stdout as HTML.

    Command-line options:
        -f / --file    read the XML from this file instead of the network.
        --bare         omit the surrounding HTML document.
        --stand-alone  forwarded to printHTML() as ``standalone``.

    Without --file the search URL is requested repeatedly, starting at
    record 1.  Each response is handed to ``InfoscienceParser.parseXML``,
    whose return value is added to the record offset; a falsy return value
    ends the loop.

    Returns None, so ``sys.exit(main())`` exits with status 0.
    """
    # Encode everything written to stdout as UTF-8.
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout)

    arg_parser = ArgumentParser(description="InfoScience publication parser.")
    arg_parser.add_argument("-f", "--file", help="Parse from external file.")
    arg_parser.add_argument("--bare", action="store_true", default=False,
                            help="Output bare HTML (without body).")
    arg_parser.add_argument("--stand-alone", action="store_true", default=False,
                            help="Output stand-alone HTML document.")

    args = arg_parser.parse_args()

    # The record parser receives this dict; it is later passed to printHTML().
    collection = {}
    record_parser = InfoscienceParser(collection)

    if args.file:
        with open(args.file, "r") as f:
            data = f.read()
        record_parser.parseXML(data)
    else:
        offset = 1
        while True:
            url = "%s?cc=%s&of=%s&jrec=%d" % (base_infoscience_url, search_cc,
                                              search_of, offset)
            logging.info("Crawling %s", url)
            response = urllib.urlopen(url)
            try:
                data = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
            total = record_parser.parseXML(data)
            if not total:
                break
            offset += total

    printHTML(collection, genbody=not args.bare, standalone=args.stand_alone)

# Run main() only when the file is executed as a script; the value main()
# returns is passed to sys.exit() as the exit status.
if __name__ == "__main__":
    sys.exit(main())