mylifeisaverage.py

This script fetches MLIA (My Life Is Average) pages, parses out the stories, and stores them in an SQLite database. The user can then run regex queries against the data to find interesting things, such as how many people end their post with "MLIA".

TODO:

  • Read list of failed pages and import them
  • Associate IDs so we stop fetching when we start getting hits

mylifeisaverage.py

import sys, os, re, urllib, threading, getopt, time
import BeautifulSoup
from sqlobject import *
 
# Number of index pages to fetch; update() requests pages 1..total_pages.
total_pages = 294
# Target cap on the number of worker threads running at the same time.
max_threads = 20
# Workers currently counted as running: incremented by the spawning code,
# decremented at the end of AverageThread.run().
current_threads = 0
# Next page number update() will hand to a worker.
current_page = 1
# Story texts collected by the workers from the pages parsed so far.
all_entries = []
# Page numbers whose download raised IOError.
failed_pages = []
 
# Shared lock taken by worker threads while they modify the lists above.
lock = threading.Lock()
 
class Meh(SQLObject):
	"""SQLObject model holding one stored story; initDB() creates its table."""
	# Story text; update()/update_pages() pass it UTF-8 encoded.
	entry = StringCol()
 
class MLIAPage():
	"""Parsed MLIA index page.

	After construction ``soup`` holds the BeautifulSoup tree built from
	``data`` and ``entries`` holds one stripped string per ``<span>`` whose
	``id`` matches ``ls_contents-[0-9]*``.  Each string is taken from the
	element that follows the span's start tag (``.next``), which is
	presumably the story text node -- verify against the live markup.
	"""

	def __init__(self, data):
		self.soup = BeautifulSoup.BeautifulSoup(data)
		story_id = re.compile("ls_contents-[0-9]*")
		story_spans = self.soup.findAll("span", id=story_id)
		self.entries = [span.next.strip() for span in story_spans]
 
class AverageThread(threading.Thread):
	"""Worker thread that downloads one MLIA index page and collects its stories.

	On success the page's entries are appended to the module-level
	``all_entries`` list.  If the download raises IOError the page number is
	appended to ``failed_pages`` instead.  Both lists are only modified while
	holding the module-level ``lock``.
	"""

	# Number of AverageThread objects created so far; also the source of ids.
	count = 0
	# Number of AverageThread objects created whose run() has not finished.
	active = 0

	def __init__(self, page_number):
		"""Register the new worker and remember which page it has to fetch."""
		AverageThread.count = AverageThread.count + 1
		AverageThread.active = AverageThread.active + 1
		self.id = AverageThread.count
		self.page_number = page_number

		threading.Thread.__init__(self)

	def run(self):
		"""Fetch and parse the page, then give back this worker's slot.

		The bookkeeping counters are decremented in a ``finally`` clause.
		The code that waits for the workers polls ``AverageThread.active``,
		so a worker that died from an unexpected exception (for example an
		error while parsing the page) without decrementing it would keep
		that wait loop running forever.  Exceptions other than IOError
		still propagate after the counters have been updated.
		"""
		global current_threads

		url = "http://mylifeisaverage.com/index.php?page=" + str(self.page_number)
		print("Thread #" + str(self.id) + ": opening page " + url)
		try:
			data = urllib.urlopen(url).read()
			self.page = MLIAPage(data)
			if lock.locked():
				print("Lock is locked")

			lock.acquire()
			try:
				all_entries.extend(self.page.entries)
			finally:
				lock.release()
		except IOError:
			print("Thread #" + str(self.id) + ": failed to open MLIA page " + str(self.page_number))
			lock.acquire()
			try:
				failed_pages.append(self.page_number)
			finally:
				lock.release()
		finally:
			# Decrementing a shared counter is a read-modify-write; doing it
			# under the lock keeps concurrent workers from losing each
			# other's updates.  NOTE: code that changes these counters
			# without taking the lock is not protected by this.
			lock.acquire()
			try:
				current_threads -= 1
				AverageThread.active = AverageThread.active - 1
			finally:
				lock.release()
 
def initDB():
	"""Connect SQLObject to ``meh.db`` in the current directory.

	Builds an SQLite connection URI from the absolute path of the database
	file, prints it, installs the connection as ``sqlhub.processConnection``
	and makes sure the ``Meh`` table exists.
	"""
	db_file = os.path.abspath("meh.db")

	drive = re.match(r"([A-Za-z]):[\\/]", db_file)
	if drive:
		# Windows path: SQLObject expects the drive as "X|" after a single
		# slash.  Any drive letter is handled, not just an upper-case "C".
		connstring = "sqlite:/" + drive.group(1) + "|/" + db_file[drive.end():]
	elif db_file.startswith("/"):
		# POSIX absolute path.  "sqlite:/" + "/path" would give
		# "sqlite://path", whose first component is parsed as a host name,
		# so use the three-slash form instead.
		connstring = "sqlite://" + db_file
	else:
		connstring = "sqlite:/" + db_file
	print(connstring)

	connection = connectionForURI(connstring)
	sqlhub.processConnection = connection
	# Ask SQLObject to create the table only when it is missing.  This also
	# covers a database file that exists but does not contain the table yet.
	Meh.createTable(ifNotExists=True)
 
def update():
	"""Fetch pages ``current_page``..``total_pages`` and store their entries.

	Starts one AverageThread per page while keeping ``current_threads`` below
	``max_threads``, waits until ``AverageThread.active`` drops to zero, then
	inserts every collected entry into the ``Meh`` table.  Entries whose
	insert raises UnicodeEncodeError are written to ``errors.txt``.  The
	numbers of pages that could not be downloaded are written, one per line,
	to ``failed_pages.txt``.
	"""
	global current_threads
	global current_page

	while current_page <= total_pages:
		if current_threads < max_threads:
			print("Spawning thread for page " + str(current_page))
			current_threads += 1
			AverageThread(current_page).start()

			current_page += 1
		else:
			# Every slot is taken: sleep briefly instead of spinning at
			# full CPU until a worker finishes.
			time.sleep(0.1)

	while AverageThread.active > 0:
		print("waiting for threads to finish")
		time.sleep(2)

	e = open("errors.txt", "w")
	try:
		for x in all_entries:
			try:
				Meh(entry=x.encode("utf-8"))
			except UnicodeEncodeError:
				e.write(x.encode("utf-8") + "\n")
	finally:
		# Close the log even if an insert fails with another exception.
		e.close()

	e = open("failed_pages.txt", "w")
	try:
		for x in failed_pages:
			e.write(str(x) + "\n")
	finally:
		e.close()
 
def update_pages():
	"""Retry the pages listed in ``failed_pages.txt`` and store their entries.

	The file is expected to hold one page number per line; blank lines are
	skipped.  One AverageThread is started per page, with at most
	``max_threads`` counted as running at a time.  After all workers have
	finished, the collected entries are inserted into the ``Meh`` table.
	Entries whose insert raises UnicodeEncodeError are written to
	``errors.txt``.  ``failed_pages.txt`` is then rewritten with the pages
	that failed again, one per line.

	Raises:
		IOError: if ``failed_pages.txt`` cannot be opened.
		ValueError: if a non-blank line is not an integer.
	"""
	global current_threads

	# Read the whole list first so the file is closed before it is reopened
	# for writing below.
	f = open("failed_pages.txt", "r")
	try:
		retry_pages = [int(line.strip()) for line in f if line.strip()]
	finally:
		f.close()

	for page in retry_pages:
		# ">=" so that no more than max_threads workers run at once.
		while current_threads >= max_threads:
			time.sleep(2)

		current_threads += 1
		AverageThread(page).start()

	while AverageThread.active > 0:
		print("waiting for threads to finish")
		time.sleep(2)

	e = open("errors.txt", "w")
	try:
		for x in all_entries:
			try:
				Meh(entry=x.encode("utf-8"))
			except UnicodeEncodeError:
				e.write(x.encode("utf-8") + "\n")
	finally:
		e.close()

	e = open("failed_pages.txt", "w")
	try:
		for x in failed_pages:
			# One number per line; without the newline the numbers run
			# together and the next retry run cannot read them back.
			e.write(str(x) + "\n")
	finally:
		e.close()
 
def query(regex, ignorecase):
	"""Print every stored entry matching ``regex``, followed by a summary line.

	regex      -- regular expression searched for anywhere in each entry.
	ignorecase -- when true, the pattern is compiled with re.IGNORECASE.
	"""
	rows = Meh.select()

	flags = 0
	if ignorecase:
		flags = re.IGNORECASE
	pattern = re.compile(regex, flags)

	matched = 0
	for row in rows:
		if pattern.search(row.entry):
			print("Match: " + row.entry)
			matched += 1

	print("Query matched " + str(matched) + " entries out of " + str(rows.count()))
 
if __name__ == "__main__":
	# Command line:
	#   -q REGEX, --query=REGEX  print stored entries matching REGEX
	#   -i                       make the query case-insensitive
	#   --update                 fetch pages and store their entries
	#   -p                       with --update, retry the pages listed in
	#                            failed_pages.txt instead of fetching all pages
	usage = "usage: " + sys.argv[0] + " [-i] -q REGEX | --update [-p]"

	try:
		optlist, args = getopt.getopt(sys.argv[1:], 'piq:', ["query=", "update"])
	except getopt.GetoptError:
		# Report an unknown option or missing argument with a short message
		# instead of a traceback.
		sys.stderr.write(str(sys.exc_info()[1]) + "\n" + usage + "\n")
		sys.exit(2)

	initDB()

	action = ""
	extra = ""
	ignore = False
	pages = False

	for o, a in optlist:
		if o == "-q" or o == "--query":
			action = "query"
			extra = a
		elif o == "-i":
			ignore = True
		elif o == "-p":
			pages = True
		elif o == "--update":
			action = "update"

	if action == "query":
		query(extra, ignore)
	elif action == "update":
		if pages:
			update_pages()
		else:
			update()
	else:
		# Neither a query nor --update was requested; say how to ask for one.
		print(usage)
 
  1. No comments yet.
  1. No trackbacks yet.