import sqlite3
from os.path import join
from time import gmtime, strftime

from mirror.utils import relevance, optvalue


class Attributes(object):
    """Pipeline that records per-item summary counts in a SQLite database.

    The hook names (``from_crawler``, ``open_spider``, ``close_spider``,
    ``process_item``) match the Scrapy item-pipeline interface. One row is
    written to the ``Attributes`` table for every processed item.
    """

    def __init__(self, path):
        """Remember the database file path; no connection is opened yet.

        Args:
            path: Filesystem path of the SQLite database to create.
        """
        self.db = path

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline with a timestamped database file name.

        The file is placed in the directory named by the crawler's
        ``RESULTS`` setting and is called ``<YYYYmmddHHMMSS>.sqlite3``,
        using the current UTC time.

        Args:
            crawler: Object exposing ``settings.get('RESULTS')``.

        Returns:
            A new instance of ``cls`` bound to the generated path.
        """
        # '%S' (zero-padded seconds) is the portable directive. The previous
        # lowercase '%s' is a platform-specific extension: it is unsupported
        # on some platforms and yields epoch seconds on glibc.
        timestamp = strftime('%Y%m%d%H%M%S', gmtime())
        return cls(join(crawler.settings.get('RESULTS'),
                        "%s.sqlite3" % (timestamp,)))

    def open_spider(self, spider):
        """Open the database and create the ``Attributes`` table.

        Args:
            spider: Unused; accepted to satisfy the hook signature.

        Raises:
            sqlite3.OperationalError: If the table already exists in the
                target database file.
        """
        # isolation_level=None puts the sqlite3 connection in autocommit
        # mode, so each INSERT is persisted without an explicit commit().
        self.conn = sqlite3.connect(self.db, isolation_level=None)
        self.cur = self.conn.cursor()
        self.cur.execute(
            "CREATE TABLE Attributes (url text PRIMARY KEY, keywords int, "
            "words int, relevancy int, tags int, semantics int, medias int, "
            "links int, injections int)"
        )

    def close_spider(self, spider):
        """Close the database connection opened in ``open_spider``.

        Args:
            spider: Unused; accepted to satisfy the hook signature.
        """
        self.conn.close()

    def process_item(self, item, spider):
        """Insert one summary row for ``item`` and pass the item through.

        Args:
            item: Mapping-like item. ``item['url']`` must be an indexable
                value whose first element is stored as the primary key, and
                ``item['keywords']`` must be present.
            spider: Unused; accepted to satisfy the hook signature.

        Returns:
            The same ``item``, unchanged.

        Raises:
            KeyError: If ``'url'`` or ``'keywords'`` is missing from ``item``.
            sqlite3.IntegrityError: If a row with the same url already exists.
        """
        row = (
            item['url'][0],
            # NOTE(review): 'keywords' is indexed directly here while every
            # other field goes through optvalue(); confirm whether a missing
            # 'keywords' key should really raise instead of counting as empty.
            len(item['keywords']),
            len(optvalue(item, 'words')),
            relevance(optvalue(item, 'keywords'), optvalue(item, 'words')),
            len(optvalue(item, 'tags')),
            len(optvalue(item, 'semantics')),
            len(optvalue(item, 'medias')),
            len(optvalue(item, 'links')),
            len(optvalue(item, 'injections')),
        )
        self.cur.execute(
            'INSERT INTO Attributes VALUES (?,?,?,?,?,?,?,?,? )', row)
        return item