mylifeisaverage.py
This script fetches MLIA pages, parses out the stories, and stores them in an SQLite database. The user can then run regex queries against the data to find interesting things, such as how many people end their post with MLIA.
TODO:
- Read the list of failed pages and import them.
- Associate IDs so we stop fetching when we start getting hits.
mylifeisaverage.py
import sys
import os
import re
import urllib
import threading
import getopt
import time

import BeautifulSoup
from sqlobject import *

# NOTE: print is always called with a single parenthesised argument, which
# behaves exactly like the print statement on Python 2.

# Number of index pages fetched by a full update().
total_pages = 294
# Upper bound on simultaneously running AverageThread workers.
max_threads = 20
# Number of workers currently running; modified only while holding `lock`.
current_threads = 0
# Next page number that update() will fetch.
current_page = 1
# Story texts collected by the workers (appended while holding `lock`).
all_entries = []
# Page numbers that could not be downloaded (appended while holding `lock`).
failed_pages = []
# Guards the shared counters and lists above.
lock = threading.Lock()


class Meh(SQLObject):
    """Database row holding the text of one story."""

    entry = StringCol()


class MLIAPage():
    """Parse one downloaded MLIA page and expose its stories.

    After construction, `soup` is the BeautifulSoup tree built from `data`
    and `entries` is the list of stripped story texts found in it.
    """

    def __init__(self, data):
        self.soup = BeautifulSoup.BeautifulSoup(data)
        # Stories are taken from <span id="ls_contents-NNN"> elements; the
        # text is the node that immediately follows the opening tag.
        spans = self.soup.findAll("span", id=re.compile("ls_contents-[0-9]*"))
        self.entries = [span.next.strip() for span in spans]


class AverageThread(threading.Thread):
    """Worker thread that downloads and parses a single MLIA page.

    Parsed stories are appended to the module-level `all_entries`; pages
    that cannot be opened are appended to `failed_pages`.
    """

    # Total number of workers ever created; used to number the threads.
    count = 0
    # Number of workers created that have not finished run() yet.
    active = 0

    def __init__(self, page_number):
        threading.Thread.__init__(self)
        # Workers decrement `active` concurrently, so update under the lock.
        with lock:
            AverageThread.count = AverageThread.count + 1
            AverageThread.active = AverageThread.active + 1
            self.id = AverageThread.count
        self.page_number = page_number

    def run(self):
        """Fetch the page, collect its entries and release the worker slot."""
        global current_threads
        url = "http://mylifeisaverage.com/index.php?page=" + str(self.page_number)
        print("Thread #" + str(self.id) + ": opening page " + url)
        try:
            try:
                data = urllib.urlopen(url).read()
                self.page = MLIAPage(data)
                if lock.locked():
                    print("Lock is locked")
                with lock:
                    all_entries.extend(self.page.entries)
            except IOError:
                print("Thread #" + str(self.id) + ": failed to open MLIA page " + str(self.page_number))
                with lock:
                    failed_pages.append(self.page_number)
        finally:
            # Always give the slot back, even when parsing raised something
            # other than IOError; otherwise the main thread waits forever.
            with lock:
                current_threads -= 1
                AverageThread.active = AverageThread.active - 1


def initDB():
    """Connect SQLObject to meh.db in the current directory.

    The Meh table is created when the database file did not exist before
    the call.
    """
    db_file = os.path.abspath("meh.db")
    maketables = not os.path.exists(db_file)
    if db_file.startswith("/"):
        # POSIX absolute path: "sqlite:/" + "/path" would yield
        # "sqlite://path", which is parsed as a host name. Use the
        # "sqlite:///path" form instead.
        connstring = "sqlite://" + db_file
    else:
        connstring = "sqlite:/" + db_file.replace("C:\\", "C|/")
    print(connstring)
    connection = connectionForURI(connstring)
    sqlhub.processConnection = connection
    if maketables:
        Meh.createTable()


def _spawn(page_number):
    """Reserve a worker slot and start a thread for `page_number`."""
    global current_threads
    with lock:
        current_threads += 1
    AverageThread(page_number).start()


def _wait_and_store():
    """Wait for all workers, then store the entries and write the reports.

    Entries whose insertion raises UnicodeEncodeError are written to
    errors.txt; failed page numbers are written to failed_pages.txt. Both
    files are written one item per line.
    """
    while AverageThread.active > 0:
        print("waiting for threads to finish")
        time.sleep(2)
    with open("errors.txt", "w") as errors:
        for entry in all_entries:
            try:
                Meh(entry=entry.encode("utf-8"))
            except UnicodeEncodeError:
                errors.write(entry.encode("utf-8") + "\n")
    with open("failed_pages.txt", "w") as failed:
        for page in failed_pages:
            failed.write(str(page) + "\n")


def update():
    """Fetch every page from `current_page` up to `total_pages`.

    At most `max_threads` workers run at once. When all workers are done
    the collected entries are stored in the database.
    """
    global current_page
    while current_page <= total_pages:
        if current_threads < max_threads:
            print("Spawning thread for page " + str(current_page))
            _spawn(current_page)
            current_page += 1
        else:
            # All slots are busy: yield instead of spinning at full speed.
            time.sleep(0.1)
    _wait_and_store()


def update_pages():
    """Re-fetch the pages listed in failed_pages.txt, one number per line.

    Raises IOError when failed_pages.txt cannot be opened and ValueError
    when a non-blank line is not an integer.
    """
    # Read the whole list and close the file first: it is rewritten below.
    with open("failed_pages.txt", "r") as f:
        pages = [line.strip() for line in f if line.strip()]
    for page in pages:
        while current_threads >= max_threads:
            time.sleep(2)
        _spawn(int(page))
    _wait_and_store()


def query(regex, ignorecase):
    """Print every stored entry matching `regex` and a summary line.

    regex      -- regular expression searched for in each entry
    ignorecase -- when true, match case-insensitively
    """
    flags = re.IGNORECASE if ignorecase else 0
    pattern = re.compile(regex, flags)
    entries = Meh.select()
    count = 0
    for e in entries:
        if pattern.search(e.entry):
            print("Match: " + e.entry)
            count = count + 1
    print("Query matched " + str(count) + " entries out of " + str(entries.count()))


def main():
    """Parse the command line and run the requested action.

    -q REGEX / --query=REGEX  search the stored entries
    -i                        make the query case-insensitive
    --update                  fetch all pages
    -p                        with --update, only re-fetch failed pages
    """
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'piq:', ["query=", "update"])
    except getopt.GetoptError as err:
        print(str(err))
        print("usage: mylifeisaverage.py [-i] -q REGEX | --update [-p]")
        sys.exit(2)
    initDB()
    action = ""
    extra = ""
    ignore = False
    pages = False
    for o, a in optlist:
        if o == "-q" or o == "--query":
            action = "query"
            extra = a
        elif o == "-i":
            ignore = True
        elif o == "-p":
            pages = True
        elif o == "--update":
            action = "update"
    if action == "query":
        query(extra, ignore)
    elif action == "update":
        if pages:
            update_pages()
        else:
            update()


if __name__ == "__main__":
    main()