Web::Scraper in Python hacks #1: Extract links linking to images

http://use.perl.org/~miyagawa/journal/34325 の真似ができるようにした。

#!/usr/bin/env python2.5
# Print the href of every <a> whose href contains '.jpg' and which directly
# wraps an <img>, scraped from a single page.
import codecs
import sys
from pprint import pprint

from scraper import process, scraper

# Wrap stdout so that text written to it is encoded as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Matches <img> elements that are direct children of an <a> whose href
# contains '.jpg'.
IMAGE_LINK_XPATH = "//a[contains(@href, '.jpg')]/img"


def parent_href(img):
    # Every element matched by IMAGE_LINK_XPATH has an <a href=...> parent.
    return img.getparent().attrib['href']


image_links = scraper(process(IMAGE_LINK_XPATH, links=parent_href))
pprint(image_links.scrape('http://manabekawori.cocolog-nifty.com/'))

$ ./imglinks.py 
{'links': ['http://manabekawori.cocolog-nifty.com/photos/uncategorized/2007/08/27/20070827220725_2.jpg',
           'http://manabekawori.cocolog-nifty.com/.shared/image.html?/photos/uncategorized/2007/08/10/20070804104553.jpg',
           'http://manabekawori.cocolog-nifty.com/.shared/image.html?/photos/uncategorized/2007/08/10/20070804104616.jpg',
           'http://manabekawori.cocolog-nifty.com/.shared/image.html?/photos/uncategorized/2007/07/31/20070731020351.jpg',
           'http://manabekawori.cocolog-nifty.com/.shared/image.html?/photos/uncategorized/070724_021614_1.jpg',
           'http://manabekawori.cocolog-nifty.com/.shared/image.html?/photos/uncategorized/070722_230739.jpg',
           'http://manabekawori.cocolog-nifty.com/.shared/image.html?/photos/uncategorized/070721_011653_1.jpg']}

lambdaサポートのために追加したのは下のコード。
ごちゃごちゃしてます。リスト内包表記を使いたかった...

# Fill ``stash`` with one list per keyword in ``attr``.  Each value is either
# a callable applied to every element matched by ``xpath``, or a string spec:
# "@name" collects that attribute from the elements that have it, and "TEXT"
# (any case) collects each element's ``.text``.
for key, val in attr.iteritems():
    if callable(val):
        stash[key] = []
        for e in tree.xpath(xpath):
            try:
                stash[key].append(val(e))
            except Exception:
                # Best effort: skip elements the callback cannot handle, but
                # let KeyboardInterrupt and SystemExit propagate.
                pass
    elif isinstance(val, str):
        if val.startswith('@'):
            # Use a separate name so the ``attr`` mapping being iterated is
            # not rebound to a string.
            attr_name = val[1:]
            stash[key] = [e.attrib[attr_name] for e in tree.xpath(xpath)
                          if attr_name in e.attrib]
        elif val.upper() == "TEXT":
            stash[key] = [e.text for e in tree.xpath(xpath)]
        # Any other string spec is ignored and leaves stash[key] unset.
    else:
        # Neither a callable nor a str: report and abort with exit status 1.
        sys.stderr.write("Got an unknown thingy\n")
        sys.exit(1)