#!/home/yanbe/local/bin/python
"""Hatena Bookmark Object - O/R mapping like module for Hatena bookmark

Query examples:

  b.new.python.entries      -> returns a generator of
                               (url, title, summary, description, [tags])
                               for entries recently bookmarked with the
                               specified tag.
  b.new.python.urls         -> returns a generator of urls.
  b.new.python.titles       -> returns a generator of titles.
  b.new.python.comments     -> returns a generator of comments.
  b.new.python.descriptions -> returns a generator of descriptions.
  b.new.python.tags         -> returns a generator of the tags of each entry.
  b.new.python.urls_tags    -> returns a generator of tuples of the specified
                               items. The item order is free, i.e.
                               "tags_urls" is also valid.
  b.new.python.tagdict      -> returns a dictionary {tag: count} collecting
                               the usage of tags.
  b.new['web2.0'].*         -  dictionary like access is also available
                               (for tags that are not valid python symbols).

  b.hot.threshold           <- set the least number of users counted as "hot".
  b.hot.*.*                 -> similar to "b.new", except that these come from
                               "recently hot" entries of the specified tag.
  b.count.*.*               -> similar to "b.new", except that these come from
                               the "most bookmarked" entries so far.

  b.hotentry.*              -> similar to "b.new", except that these come from
                               "recently hot" entries and no tag is specified.
  b.hotentry.category.*     -> similar to "b.hotentry.*", filtered through the
                               specified "category". The category list is
                               available via "b.hotentry.categories".

  b.user_id.*               -> similar to "b.new", except that this returns
                               the specified user_id's information.
  b.user_id.python.*        -> a user and tag specified query is also
                               available.
  b['user_id'].*            -> dictionary like access is also available
                               (for user_ids that are not valid python
                               symbols, e.g. containing '-').

  b['url'].info             -> returns the entry information of each user,
                               i.e. (user_id, [tags], comment, timestamp),
                               as a generator.
  b['url'].users            -> returns user_ids as a generator.
  b['url'].tags             -> returns the tag list of each user as a
                               generator.
  b['url'].tagdict          -> returns a dictionary like {tag: count}.
  b['url'].comments         -> returns comments as a generator.
  b['url'].timestamps       -> returns timestamps as a generator.
  b['url'].url              -> returns the url of this url.
  b['url'].title            -> returns the title of this url.
  b['url'].count            -> returns the number of users who bookmarked
                               this url.
  b['url'].entry_url        -> returns the entry url on Hatena Bookmark.
  b['url'].eid              -> returns the entry_id of this url on Hatena
                               Bookmark.
  b['url'].screenshot       -> returns the screenshot url of Hatena Bookmark.

  bookmark, hateb           -  aliases of "b".
"""

# Recent updates
#   0.9.6 - added "hateb" alias as top level object
#   0.9.5 - implemented __repr__ so as not to get error if called like
#           "hatena.b.hot". Now hatena.b['http://example.com'].count and
#           hatena.b['http://example.com'].eid returns int.
#   0.9.4 - removed unnecessary class inheritance (dict).

import cPickle
import datetime
import httplib
import os
import sys
import urllib
import urllib2
import xmlrpclib

import feedparser
import simplejson
from BeautifulSoup import BeautifulSoup

import proxy

__author__ = 'Yusuke Yanbe'
__date__ = '10-18-2006'
# NOTE(review): the changelog above already mentions 0.9.6 -- confirm whether
# this should be bumped.
__version__ = '0.9.5'
__license__ = 'GPL'
__copyright__ = 'Copyright (c) Yusuke Yanbe, 2006'
__all__ = ['b', 'bookmark', 'hateb', 'feed_cache', 'json_cache', 'getCount']

bookmark_base_url = 'http://b.hatena.ne.jp/'


def _is_special(name):
    """Return True for dunder names such as ``__repr__`` or ``__iter__``.

    The query classes below answer *every* unknown attribute through
    ``__getattr__``. Protocol lookups made by the interpreter (repr, iter,
    copy, pickle, ...) must not be mistaken for tags, users or categories.
    """
    return name.startswith('__') and name.endswith('__')


def _describe(obj):
    """Build the ``<ClassName object at 0x...>`` text used by ``__repr__``."""
    return '<%s object at 0x%x>' % (obj.__class__.__name__, id(obj))


def _derive(selector, **state):
    """Return a fresh instance of ``selector``'s class carrying ``state``.

    Query objects hang off shared singletons (``b.hotentry``, ``b.user_info``).
    Storing a half-finished selection on the singleton would leak it into the
    next, unrelated query, so every selection step yields a new object.
    """
    clone = selector.__class__()
    for name, value in state.items():
        setattr(clone, name, value)
    return clone


def _count_tags(tag_lists):
    """Fold an iterable of tag lists into a ``{tag: occurrences}`` dict."""
    counts = {}
    for tags in tag_lists:
        for tag in tags:
            counts[tag] = counts.get(tag, 0) + 1
    return counts


def _fetch(url, attempts=3, announce_retry=False):
    """Read ``url`` and return the response body, trying ``attempts`` times.

    Every failure is printed. When ``announce_retry`` is true the retry
    counter message is printed as well. If all attempts fail the last error
    is re-raised instead of continuing with an unbound result.
    """
    last_error = None
    for i in range(attempts):
        try:
            return urllib2.urlopen(url).read()
        except Exception:
            # sys.exc_info() keeps this syntax valid on old Python 2 releases.
            last_error = sys.exc_info()[1]
            print(last_error)
            if announce_retry:
                print('try to read again (%d/3 time(s)' % (i + 1))
    raise last_error


class generate_iterator:
    """Attribute driven view over the entries of a parsed feed.

    ``entries`` yields (link, title, summary, description, tags) tuples,
    ``tagdict`` returns ``{tag: count}``, and any '_'-joined combination of
    the keys of ``trans`` (e.g. ``urls_tags``) yields the matching fields.
    """

    # Public query word -> attribute name on a feed entry.
    trans = {'urls': 'link', 'titles': 'title', 'comments': 'summary',
             'descriptions': 'description', 'tags': '_tags'}

    def __init__(self, d):
        """Wrap the parsed feed ``d`` and attach ``_tags`` to each entry."""
        entries = []
        for e in d.entries:
            tags = []
            if 'tags' in e and e.tags:
                tags = e.tags
            # Flatten tag objects into plain terms once, up front.
            e._tags = [tag.term for tag in tags]
            entries.append(e)
        d.entries = entries
        self.d = d

    def __getattr__(self, attr):
        """Resolve a query word.

        Raises AttributeError for dunder names and KeyError for query words
        that are not made of the keys of ``trans``.
        """
        if _is_special(attr):
            raise AttributeError(attr)
        if attr == 'entries':
            return ((e.link, e.title, e.summary, e.description, e._tags)
                    for e in self.d.entries)
        if attr == 'tagdict':
            return _count_tags(e._tags for e in self.d.entries)
        try:
            attrs = [self.trans[item] for item in attr.split('_')]
        except KeyError:
            raise KeyError('Attribute "%s" does not supported' % attr)
        # Queries involving comments only keep entries whose summary is
        # non-empty and shorter than 100 characters.
        wants_comments = attr.find('comments') != -1
        g = (tuple([getattr(e, a) for a in attrs]) for e in self.d.entries
             if not wants_comments or 0 < len(e.summary) < 100)
        if len(attrs) == 1:
            # A single field is yielded bare rather than as a 1-tuple.
            return (i[0] for i in g)
        return g


class generate_iterator_json:
    """Attribute driven view over the decoded JSON of one bookmarked url."""

    # Public query word -> key of a bookmark record in the JSON data.
    trans = {'users': 'user', 'tags': 'tags', 'comments': 'comment',
             'timestamps': 'timestamp'}

    def __init__(self, j):
        """Wrap the decoded JSON object ``j`` (may be empty or None)."""
        self.j = j

    def __getattr__(self, attr):
        """Resolve a query word; every query on empty data returns ``{}``.

        Raises AttributeError for dunder names and KeyError for query words
        that are not supported.
        """
        if _is_special(attr):
            raise AttributeError(attr)
        if not self.j:
            return {}
        if attr == 'info':
            return ((bm['user'], bm['tags'], bm['comment'], bm['timestamp'])
                    for bm in self.j['bookmarks'])
        if attr in ('url', 'title', 'entry_url', 'screenshot'):
            return self.j[attr]
        if attr in ('count', 'eid'):
            return int(self.j[attr])
        if attr == 'tagdict':
            return _count_tags(bm['tags'] for bm in self.j['bookmarks'])
        try:
            attrs = [self.trans[item] for item in attr.split('_')]
        except KeyError:
            raise KeyError('Attribute "%s" does not supported' % attr)
        # A plain "comments" query skips bookmarks without a comment.
        g = (tuple([bm[a] for a in attrs]) for bm in self.j['bookmarks']
             if attr != 'comments' or bm['comment'])
        if len(attrs) == 1:
            return (i[0] for i in g)
        return g

    def __repr__(self):
        return _describe(self)


class URLPickleCache(proxy.HTTPCache):
    """Pickle backed cache keyed by URL.

    ``filename()``, ``dirname()`` and the ``in`` test are inherited from
    ``proxy.HTTPCache``; they are presumed to map a URL onto a path below
    ``cache_root`` -- verify against the proxy module.
    """

    def __init__(self, cache_root):
        self.cache_root = cache_root

    def __getitem__(self, path):
        """Load and return the object cached for ``path``."""
        # Text mode is kept on purpose: entries are written with the ASCII
        # pickle protocol in text mode, and both sides must stay consistent.
        # NOTE: never point the cache root at files from an untrusted source;
        # unpickling executes arbitrary code.
        f = open(self.filename(path), 'r')
        try:
            return cPickle.load(f)
        finally:
            f.close()

    def __setitem__(self, path, data):
        """Store ``data`` for ``path``; failures are deliberately ignored."""
        filename = None
        try:
            _dir = self.dirname(path)
            if not os.access(_dir, os.F_OK):
                os.makedirs(_dir)
            filename = self.filename(path)
            f = open(filename, 'w')
            try:
                cPickle.dump(data, f)
            finally:
                f.close()
        except Exception:
            # Caching is best effort: an unwritable cache directory or an
            # unpicklable object must not break the query itself. Drop a
            # partially written file so it is not loaded later.
            if filename is not None:
                try:
                    os.remove(filename)
                except OSError:
                    pass


feed_cache = URLPickleCache('/tmp/feed_cache')


def parse_feed(feed_url, use_cache=True):
    """Return the feedparser result for ``feed_url``.

    With ``use_cache`` true (the default) a cached result is returned when
    available and fresh results are stored. With ``use_cache`` false the
    cache is neither consulted nor updated.
    """
    if use_cache and feed_url in feed_cache:
        return feed_cache[feed_url]
    d = feedparser.parse(feed_url)
    if use_cache:
        feed_cache[feed_url] = d
    return d


class new:
    """Entries recently bookmarked with a tag: ``b.new.<tag>.<query>``."""

    threshold = 5
    url_tmpl = bookmark_base_url + 't/%s?mode=rss&sort=eid&threshold=%s'

    def __getattr__(self, attr):
        if _is_special(attr):
            raise AttributeError(attr)
        return self.__getitem__(attr)

    def __getitem__(self, attr):
        """Return a generate_iterator over the feed of tag ``attr``."""
        return generate_iterator(
            parse_feed(self.url_tmpl % (attr, self.threshold)))

    def __repr__(self):
        return _describe(self)


class hot(new):
    """Recently hot entries of a tag; lookup logic is inherited from new."""

    threshold = 3
    url_tmpl = bookmark_base_url + 't/%s?mode=rss&sort=hot&threshold=%s'


class count(new):
    """Most bookmarked entries of a tag so far."""

    url_tmpl = bookmark_base_url + 't/%s?mode=rss&sort=count&threshold=%s'


suffixes_base = ('', 'entries', 'urls', 'titles', 'comments',
                 'descriptions', 'tags')


def _build_suffixes(base):
    """List every single query word and every ordered pair 'a_b' of words."""
    found = []
    for s1 in base:
        for s2 in base:
            if not s1 == s2 and s1 and s2:
                found.append(s1 + '_' + s2)
            elif s1 and not s2:
                found.append(s1)
            elif not s1 and s2:
                found.append(s2)
    found.append('tagdict')
    return found


# Query words that terminate a "b.hotentry..." / "b.<user>..." chain.
suffixes = _build_suffixes(suffixes_base)


def _hotentry_url(category):
    """Return the hotentry feed URL, narrowed to ``category`` when given."""
    url = bookmark_base_url + 'hotentry?mode=rss'
    if category:
        url += '&cname=' + category
    return url


class hotentry:
    """Recently hot entries: ``b.hotentry.<query>`` or
    ``b.hotentry.<category>.<query>``."""

    category = None
    categories = ('book', 'music', 'movie', 'web', 'elec', 'animal', 'anime',
                  'food', 'sports', 'game', 'comic', 'idol', 'geography',
                  'art', 'science', 'hatena', 'none')

    def __getattr__(self, attr):
        """Run the query ``attr`` or select ``attr`` as the category.

        Raises NameError when a category is already selected and ``attr`` is
        not a query word.
        """
        if _is_special(attr):
            raise AttributeError(attr)
        if attr in suffixes:
            feed = parse_feed(_hotentry_url(self.category))
            return getattr(generate_iterator(feed), attr)
        if not self.category:
            # Return a new object so the shared b.hotentry stays unselected.
            return _derive(self, category=attr)
        raise NameError(attr)

    def __getitem__(self, attr):
        """Return a generate_iterator for category ``attr``.

        'entries' selects the uncategorised feed. Raises NameError for a
        name that is not listed in ``categories``.
        """
        if attr == 'entries':
            attr = ''
        elif attr not in self.categories:
            raise NameError(attr)
        return generate_iterator(parse_feed(_hotentry_url(attr)))

    def __repr__(self):
        return _describe(self)


class user_info:
    """Bookmarks of one user: ``b.<user_id>.<query>`` or
    ``b.<user_id>.<tag>.<query>``."""

    user_id = None
    tag = None

    def __getattr__(self, attr):
        """Run the query ``attr`` or select ``attr`` as the tag.

        Raises NameError when a tag is already selected and ``attr`` is not
        a query word.
        """
        if _is_special(attr):
            raise AttributeError(attr)
        if attr in suffixes:
            url = bookmark_base_url + self.user_id + '/rss'
            if self.tag:
                url += '?tag=' + self.tag
            return getattr(generate_iterator(parse_feed(url)), attr)
        if not self.tag:
            return _derive(self, user_id=self.user_id, tag=attr)
        raise NameError(attr)

    def __getitem__(self, attr):
        """Select ``attr`` as the user_id, or as the tag if a user is set."""
        if not self.user_id:
            return _derive(self, user_id=attr, tag=self.tag)
        if not self.tag:
            return _derive(self, user_id=self.user_id, tag=attr)
        return self

    def __repr__(self):
        return _describe(self)


json_cache = URLPickleCache('/home/yanbe/tmp/json_cache')


class entry_info:
    """Per-url bookmark information fetched from the JSON API."""

    json_base = 'entry/json/'

    def __getitem__(self, attr):
        """Return a generate_iterator_json for the url ``attr``.

        The decoded response is cached in ``json_cache``. The request is
        tried up to 3 times; the last error is raised if all of them fail.
        """
        url = bookmark_base_url + self.json_base + urllib.quote(attr)
        if url in json_cache:
            data = json_cache[url]
        else:
            response = _fetch(url, announce_retry=True)
            # Drop the first and last character wrapping the JSON payload.
            data = simplejson.loads(response[1:-1])
            json_cache[url] = data
        return generate_iterator_json(data)


class bookmark_info:
    """Top level object: dispatches to the tag, hotentry, user and url
    queries."""

    new = new()
    hot = hot()
    count = count()
    hotentry = hotentry()
    entry_info = entry_info()
    user_info = user_info()

    def __getattr__(self, attr):
        if _is_special(attr):
            raise AttributeError(attr)
        return self.__getitem__(attr)

    def __getitem__(self, attr):
        """Treat ``attr`` as a url if it looks like one, else as a user_id."""
        if attr.startswith('http://') or attr.startswith('https://'):
            return self.entry_info[attr]
        return self.user_info[attr]

    def __repr__(self):
        return _describe(self)


bookmark = bookmark_info()
b = bookmark
hateb = b

server_url = 'http://b.hatena.ne.jp/xmlrpc'
server = xmlrpclib.Server(server_url)


def getCount(*urls):
    """Return the result of the XML-RPC call bookmark.getCount for ``urls``."""
    return server.bookmark.getCount(*urls)


def getHotEntry(date):
    """Yield (href, first content node) of each bookmark link of the daily
    hotentry page for ``date``.

    The page is requested up to 3 times; the last error is raised if all
    attempts fail.
    """
    url = 'http://b.hatena.ne.jp/hotentry?mode=daily&date=' + str(date)
    data = _fetch(url)
    soup = BeautifulSoup(data.decode('utf8', 'replace'))
    for a in soup('a', {'class': 'bookmark'}):
        yield a['href'], a.contents[0]


def daysToBeDiscovered(url):
    """Compare the Last-Modified date of ``url`` with its bookmark time.

    Returns None when the Last-Modified header is missing or unparsable or
    when there are no bookmark timestamps, -1 when the page was modified
    after the reference bookmark, and otherwise the datetime.timedelta
    between the last modification and that bookmark.
    """
    from email.Utils import parsedate
    h = urllib2.urlopen(url).headers
    t = list(b[url].timestamps)
    if 'last-modified' not in h or len(t) == 0:
        return None
    parsed = parsedate(h['last-modified'])
    if parsed is None:
        return None
    # presumably the last timestamp is the earliest bookmark -- TODO confirm
    # the ordering returned by the JSON API.
    firstDate = t[-1]
    # Character ranges of year, month, day, hour, minute and second.
    parser = (0, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19)
    firstData = []
    for begin, end in parser:
        firstData.append(firstDate[begin:end])
    firstDate = datetime.datetime(*map(int, firstData))
    # Only the first six fields (year .. second) are meaningful.
    lastModified = datetime.datetime(*parsed[:6])
    lastModified += datetime.timedelta(hours=9)  # assume GMT
    d = firstDate - lastModified
    zeroDay = datetime.timedelta()
    if d < zeroDay:
        return -1
    return d


if __name__ == '__main__':
    for u in b.naoya.urls:
        d = daysToBeDiscovered(u)
        if d is None:
            print('n/a %s' % (u,))
        elif d == -1:
            print('mod %s' % (u,))
        else:
            print('%s %s' % (d, u))
    # for i in range(1,4):
    #     current = today - datetime.timedelta(i)
    #     for url,title in getHotEntry(current.strftime('%Y%m%d')):
    #         print title
    #         print url