もろもろハッカーニュースアンケート
プログラミングはじめて何年?
Poll: How long have you been programming?
http://news.ycombinator.com/item?id=3786926
Less than 6 months (22)
1 year (29)
2 years (48)
3-5 years (180)
7 years (163)
10+ years (500)
20+ years (202)
30+ years (76)
50+ years (1)
メインOSは?
Poll: What is your primary operating system
http://news.ycombinator.com/item?id=3786674
OSX (500)
Windows (262)
Linux (404)
Other Unix variant (9)
iOS (15)
Android (14)
Chrome OS (3)
Other (2)
最近アンケート多すぎない?
ハッカーニュースアンケートグラフ生成プログラム。Unixパイプに限界を感じたのでズルしてpythonで書いた。あとはこれをwebでホスティングすれば、完全自動化できる。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys,os
import re
import time
import lxml.html
import urllib2
import urllib
import jinja2

# Default Jinja2 template: shows the poll title linked back to the original
# HN item, then one horizontal bar per option.  Each bar's pixel width is the
# option's normalized score (see normalized_votes).
default_template = """
<div>
<a href="{{original_url}}"><h3>{{title}}</h3></a>
<p>
<a href="{{original_url}}">{{original_url}}</a>
</p>
{% for vote in votes %}
<span>{{vote.0}} ({{vote.1}})</span>
<div style="width:{{vote.1}}px; height: 1em; background:#cceecc; border: 1px solid #000; margin-bottom: 1em;" >
</div>
{% endfor %}
</div>
"""


def normalized_votes(trs, max_width=1000):
    """Scale vote scores so that the top score maps to max_width pixels.

    :param trs: iterator of vote-row elements, consumed by votes().
    :param max_width: bar width, in pixels, for the highest-scoring option.
    :return: list of (label, scaled_score) pairs.
    :raises ValueError: if trs yields no complete vote triple (max of []).
    """
    vts = list(votes(trs))
    max_score = max([pair[1] for pair in vts])
    # Floor division keeps widths integral; identical to Python 2's
    # int/int "/" that the original relied on.
    return [(label, max_width * score // max_score) for label, score in vts]


def votes(trs):
    """vote row elements to sequence of pairs (label, score)

    HN poll markup lays out each option as three table rows: the option
    label, the "<n> points" line, and a spacer.  Consume the iterator in
    triples until it is exhausted.
    """
    while True:
        try:
            item = next(trs)
            comment = next(trs)
            spacer = next(trs)  # discarded; only advances the iterator
        except StopIteration:
            break
        m = re.match(r'^(\d+)\s*\w+', comment.text_content())
        if not m:
            raise RuntimeError("failed to parse vote: " + str(item))
        score = int(m.group(1))
        yield (item.text_content(), score)
def parse(html_str, **opt):
    """ html_string --> page data dict to be rendered.

    Extracts the poll title, a link back to the HN item, and the
    normalized vote bars.  Keyword options (e.g. max_width) are passed
    through to normalized_votes.
    """
    lx = lxml.html.fromstring(html_str)
    # The vote rows live in the third nested table of HN's page layout.
    votes_table = lx.find('./body/center/table/tr/td/table/tr/td/table')
    title_td = lx.xpath('.//td[@class="title"]')[0]
    title_link = title_td.find('.//a')
    return dict(votes=normalized_votes(iter(votes_table), **opt),
                title=title_td.text_content(),
                original_url=None if title_link is None
                else 'http://news.ycombinator.com/' + title_link.get('href'),
                )


def resolve_template(template=None):
    """Resolve the template option to a compiled jinja2 template.

    Accepts None (use default_template), a path to an existing template
    file (read as UTF-8), or a literal template string.
    """
    # tdir,tname=os.path.split(template)
    # env=jinja2.Environment(loader=jinja2.FileSystemLoader(tdir))
    # return env.get_template(tname)
    if template is None:
        template_str = default_template
    elif os.path.exists(template):
        template_str = file(template).read().decode('utf8')
    else:
        template_str = template
    return jinja2.Environment().from_string(template_str)


def fetch(url):
    """Resolve url|or-html-file to html string.
    A simple caching is implemented so that you dont get banned from hn for repeated use.
    """
    cache_file = os.path.join('/var/tmp/',
                              'hn-poll-' + urllib.quote(url).replace('/', '-'))
    # Also accept https:// so current (TLS-only) HN URLs work.
    if not url.lower().startswith(('http://', 'https://')):
        # accept local file
        html_file = url
        return file(html_file).read().decode('utf8')
    elif os.path.exists(cache_file) and time.time() - os.stat(cache_file).st_mtime <= 3600:
        # cache hit: reuse a copy fetched within the last hour
        print >>sys.stderr, 'cache-hit:', cache_file
        html = file(cache_file).read()
    else:
        # fetch
        print >>sys.stderr, 'fetching:', url
        html = urllib2.urlopen(url).read()
        # write-then-rename so a partial download never poisons the cache
        tmp_file = cache_file + '.tmp'
        file(tmp_file, 'w').write(html)
        os.rename(tmp_file, cache_file)
    return html.decode('utf8')
if __name__=='__main__':
import baker
import json@baker.command
def as_json(html_file, indent=None):
"""dumps (item,vote) pairs as json"""print json.dumps(parse(file(html_file).read()),
indent=indent if indent is None else int(indent))@baker.command
def as_html(hackernews_poll_url, template=None, max_width=1000):
"""render hackernews poll url into html graph.
"""
html=fetch(hackernews_poll_url)
template=resolve_template(template)
print template.render(**parse(html, max_width=int(max_width)))baker.run()