#!python
#############################################################################
## Grabber.py 1.1.0                                                       ###
## Slashdot journal extraction                                            ###
## Usage: python Grabber.py [username]                                    ###
##                                                                        ###
##                                                                        ###
## Copyright 2005 Leons Petrazickis                                       ###
## Leons.Petrazickis@gmail.com                                            ###
## LPetr.org                                                              ###
##                                                                        ###
## Distributed under the GNU General Public License v2                    ###
## http://www.gnu.org/licenses/gpl.txt                                    ###
#############################################################################

import re
import sys
import threading
from time import sleep

try:
    import Queue  # Python 2 module name
except ImportError:
    import queue as Queue  # Python 3 renamed the module; keep the old name

# fixes LookupError: unknown encoding: idna
import encodings
import encodings.idna
import encodings.ascii

# NOTE(review): several string literals in this file look as if an HTML-to-text
# step removed the markup they once contained (raw line breaks or nothing is
# left where a tag would be expected).  They are reproduced below exactly as
# they survive, with line breaks written as '\n'.  Restore the real literals
# from the upstream copy before relying on the output.

#############################################################################
## Globals                                                                ###
#############################################################################


def printDebug(stuff):
    """Write `stuff` to stderr, followed by a newline."""
    sys.stderr.write(stuff + '\n')


SYMBOL_STRIP = re.compile('[^a-zA-Z0-9 ]')  # anything but ASCII alphanumerics/space
SINGLE_SPACE = re.compile('([^ ]+)')        # one run of non-space characters
# NOTE(review): survives as a single space, so every .replace(NOBR, '') below
# removes ALL spaces.  Presumably this was a markup snippet -- confirm upstream.
NOBR = ' '


def camelCase(phrase):
    """Turn any sentence into a Java-style camelCase name.

    Characters other than ASCII letters, digits and spaces are dropped; the
    first word is lower-cased and every following word is capitalized.

    Returns '' when the phrase contains no usable characters (the previous
    implementation raised IndexError on words[0] in that case).
    """
    words = SINGLE_SPACE.findall(SYMBOL_STRIP.sub('', phrase))
    if not words:
        return ''
    return words[0].lower() + ''.join(word.capitalize() for word in words[1:])


def autoLineBreak(blurb):
    """Re-flow the line breaks of a text blurb.

    All existing newlines are removed first, then three replacements are
    applied in order.

    NOTE(review): the search strings of the three replacements survive only
    as newlines (one, two and two respectively) -- most likely stripped markup
    tags.  Because the first step removes every newline, they currently never
    match and the function just deletes newlines.
    """
    blurb = blurb.replace('\n', '')
    blurb = blurb.replace('\n', '\n\n')
    blurb = blurb.replace('\n\n', '\n\n\n')
    blurb = blurb.replace('\n\n', '\n\n\n')
    return blurb


#############################################################################
## JournalEntry                                                           ###
#############################################################################
class JournalEntry:
    """One journal entry: title, date, body text and its comments."""

    path = "./"          # path to file output
    #prefix = "temp"
    suffix = ".txt"      # extension of the written file
    numFormat = "%08d"   # zero-padded entry id used in the file name
    DIGEST_LEN = 256     # the max length of an entry summary
    HTML_STRIP = re.compile('<[^>]+>')

    def __init__(self, username, id):
        self.user = username
        self.title = ''
        self.date = ''
        self.id = int(id)
        self.body = ''
        self.comments = []

    def _clean(self, body):
        """Apply the filler-removal replacements shared by setBody/addBody."""
        # NOTE(review): the first search string survives as '' (a no-op);
        # presumably a stripped tag.  Kept verbatim.
        return body.replace('', '').replace(NOBR, '')

    def setTitle(self, title):
        self.title = title

    def getTitle(self):
        return self.title

    def setDate(self, date):
        self.date = date

    def getDate(self):
        return self.date

    def setID(self, id):
        self.id = int(id)

    def getID(self):
        return self.id

    def setBody(self, body):
        """Replace the body with the cleaned `body`."""
        self.body = self._clean(body)

    def getBody(self):
        return self.body

    def addBody(self, body):
        """Append the cleaned `body` to the current body."""
        self.body += self._clean(body)

    def addComment(self, comment):
        self.comments.append(comment)

    def getComments(self):
        return self.comments

    def __str__(self):
        """Render title, date, re-flowed body and all comments as text."""
        comments = [str(comment) for comment in self.comments]
        entry = (self.title, '\n',
                 self.date, '\n\n',
                 autoLineBreak(self.body), '\n\n',
                 '\n\n'.join(comments))
        return ''.join(entry)

    def getIndexItem(self):
        """Create a table-of-contents line: the title and a short digest.

        The digest is the first DIGEST_LEN characters of the body with
        newlines, tags and stray angle brackets removed.

        NOTE(review): the surviving template "%s: %s" has two placeholders,
        but the original code supplied (user, id, title, digest), which always
        raises TypeError.  Presumably user and id fed a link that was stripped
        from the template; until it is restored only title and digest are
        rendered.
        """
        digest = self.body[:self.DIGEST_LEN].replace('\n', '')
        digest = self.HTML_STRIP.sub('', digest)
        digest = digest.replace('>', '').replace('<', '')
        return "%s: %s" % (self.title, digest)

    def write(self):
        """Write the entry to `path` as <zero-padded id>_<camelCaseTitle><suffix>."""
        suffix = '_' + camelCase(self.title) + str(self.suffix)
        f = open(self.path + str(self.numFormat % self.id) + suffix, 'w')
        try:
            f.write(str(self))
        finally:
            # close the handle even when the write fails
            f.close()


#############################################################################
## JournalComment                                                         ###
#############################################################################
class JournalComment:
    """One reader comment attached to a journal entry."""

    def __init__(self, subject):
        self.subject = subject
        self.user = ''
        self.id = 0
        self.date = ''
        self.body = ''

    def getSubject(self):
        return self.subject

    def setUser(self, user):
        self.user = user

    def getUser(self):
        return self.user

    def setID(self, id):
        self.id = int(id)

    def getID(self):
        return self.id

    def setDate(self, date):
        self.date = date

    def getDate(self):
        # was `return date`, an undefined name (NameError on every call)
        return self.date

    def setBody(self, body):
        """Replace the body, dropping triple-tab indents and NOBR filler."""
        self.body = body.replace('\t\t\t', '').replace(NOBR, '')

    def getBody(self):
        return self.body

    def addBody(self, body):
        """Append to the body, dropping triple-tab indents and NOBR filler."""
        self.body += body.replace('\t\t\t', '').replace(NOBR, '')

    def __str__(self):
        return ''.join(
            ['Subject: ', self.subject, '\n',
             'From: ', self.user, '\n',
             '\n',
             autoLineBreak(self.body), '\n',
             '\t***']
        )


#############################################################################
## Grabber                                                                ###
#############################################################################
class Grabber(threading.Thread):
    """Worker thread that downloads one user's journal entries.

    Each finished JournalEntry is written to disk and put on `self.queue`;
    `self.done` becomes True when the thread has finished, for any reason.

    NOTE(review): the REGEX_* patterns below look as if an HTML-to-text step
    removed the markup they matched; only newlines (or nothing) remain where
    tags are expected.  They are reproduced exactly as they survive.
    REGEX_LINK in particular has no capture group left, so run() will fail on
    m.group(1) until the real pattern is restored from the upstream copy.
    """

    # run() reads self.SLEEPTIME, so it must be defined (it appeared to be
    # commented out in the collapsed source).
    SLEEPTIME = 0.5  # seconds
    NEXT_PREFIX = 'http://slashdot.org/journal.pl?op=display&uid='
    MAX_QUEUE_SIZE = 0  # infinite

    # valid instance states
    STATE_GREY = 'neutral grey'
    STATE_TITLE = 'title'
    STATE_DATE = 'date'
    STATE_BODY = 'body'
    STATE_TAIL = 'tail'
    STATE_COM_TITLE = 'ctitle'
    STATE_COM_MAKER = 'cmaker'
    STATE_COM_DATE = 'cdate'
    STATE_COM_TEXT = 'cbody'

    # helper regex for transitions between states
    REGEX_TITLE = re.compile(r'^\n\n(.*)\n\n')
    REGEX_DATE = re.compile(r'\s*(.*@.*)')
    REGEX_BODY_OPEN = re.compile(r'^\s*\n(.*)(\n)?')
    REGEX_BODY_CLOSE = re.compile(r'^(.*)\s*')
    REGEX_COM_TITLE = re.compile(r'^\s*\n\n(.*)\n\n')
    REGEX_COM_MAKER = re.compile(r'^\s*by (.+) \([0-9]+\)')
    REGEX_COM_DATE = re.compile(r'^\s*on (.+) \(')
    REGEX_COM_TEXT = re.compile(r'^\s*\n')
    REGEX_COM_END = re.compile(r'^\s*\n')
    REGEX_UID = None  # compiled per instance in __init__
    REGEX_LINK = re.compile(r'^\s*')

    def makeUserURL(self, username):
        return 'http://slashdot.org/~' + username + '/'

    def makeListURL(self, userid):
        return 'http://slashdot.org/journal.pl?op=list&uid=' + userid

    def makeEntryURL(self, username, id):
        return ('http://slashdot.org/~' + username + '/journal/' + id
                + '?threshold=-1&mode=nested&commentsort=0')

    def __init__(self, username):
        threading.Thread.__init__(self)
        try:
            from urllib import quote  # Python 2
        except ImportError:
            from urllib.parse import quote  # Python 3

        self.done = False
        self.state = self.STATE_GREY
        self.user = quote(username)
        self.uid = None
        self.url = self.makeUserURL(self.user)
        # Escape the name so regex metacharacters in it are matched literally.
        self.REGEX_UID = re.compile(
            r"^ %s \(([0-9]+)\)" % re.escape(username), re.IGNORECASE)

        # Current journal entry
        self.queue = Queue.Queue(self.MAX_QUEUE_SIZE)
        self.entry = None
        self.comment = None

        # List of journals
        self.journalIDs = []

    def _readLines(self, url):
        """Yield each line of the page at `url`, closing the connection after."""
        try:
            from urllib2 import urlopen  # Python 2
        except ImportError:
            from urllib.request import urlopen  # Python 3
        page = urlopen(url)
        try:
            for line in page:
                if not isinstance(line, str):
                    # Python 3 yields bytes; latin-1 maps every byte to one
                    # character, so this decode cannot fail.
                    line = line.decode('latin-1')
                yield line
        finally:
            page.close()

    def _parseLine(self, line):
        """Advance the entry/comment state machine by one line of the page."""
        state = self.state

        # if just starting, look for the journal title
        if state == self.STATE_GREY:
            m = self.REGEX_TITLE.match(line)
            if m is not None:
                self.entry.setTitle(m.group(1))
                self.state = self.STATE_TITLE

        # if already grabbed a title
        elif state == self.STATE_TITLE:
            m = self.REGEX_DATE.match(line)
            if m is not None:
                self.entry.setDate(m.group(1))
                self.state = self.STATE_DATE

        # if already grabbed a date
        elif state == self.STATE_DATE:
            m = self.REGEX_BODY_OPEN.match(line)
            if m is not None:
                self.state = self.STATE_BODY
                self.entry.addBody(m.group(1))

        # if currently grabbing the body
        elif state == self.STATE_BODY:
            m = self.REGEX_BODY_CLOSE.match(line)
            if m is not None:
                # the body has ended
                self.entry.addBody(m.group(1))
                self.state = self.STATE_TAIL
            else:
                self.entry.addBody(line)

        # once in comment mode, find the reply subject
        elif state == self.STATE_TAIL:
            m = self.REGEX_COM_TITLE.match(line)
            if m is not None:
                self.comment = JournalComment(m.group(1))
                self.state = self.STATE_COM_TITLE

        # found reply subject, looking for the responder's name
        elif state == self.STATE_COM_TITLE:
            m = self.REGEX_COM_MAKER.match(line)
            if m is not None:
                self.comment.setUser(m.group(1))
                self.state = self.STATE_COM_MAKER

        # found person's name, looking for date
        elif state == self.STATE_COM_MAKER:
            m = self.REGEX_COM_DATE.match(line)
            if m is not None:
                self.comment.setDate(m.group(1))
                self.state = self.STATE_COM_DATE

        # found comment datetime, looking for comment text start
        elif state == self.STATE_COM_DATE:
            if self.REGEX_COM_TEXT.match(line) is not None:
                self.state = self.STATE_COM_TEXT

        # eat up comment body text, unless the comment is over
        elif state == self.STATE_COM_TEXT:
            if self.REGEX_COM_END.match(line) is not None:
                self.entry.addComment(self.comment)
                self.comment = None
                self.state = self.STATE_TAIL
            else:
                self.comment.addBody(line)

    def _grabAll(self):
        """Fetch the user id, the journal list, then every journal entry."""
        # get the user id
        for line in self._readLines(self.url):
            m = self.REGEX_UID.match(line)
            if m is not None:
                self.uid = m.group(1)
                break
        sleep(self.SLEEPTIME)

        # can't continue without user id
        if self.uid is None:
            return

        # get the journal list
        for line in self._readLines(self.makeListURL(self.uid)):
            m = self.REGEX_LINK.match(line)
            if m is not None:
                self.journalIDs.append(m.group(1))
        sleep(self.SLEEPTIME)

        # get the journal entries
        for id in self.journalIDs:
            # create the skeleton entry
            self.entry = JournalEntry(self.user, id)

            # fill in the blanks
            url = self.makeEntryURL(self.user, id)
            print(url)
            for line in self._readLines(url):
                self._parseLine(line)

            # having parsed every line, flush a comment that never closed
            if self.comment is not None:
                self.entry.addComment(self.comment)
                self.comment = None

            self.entry.write()
            self.queue.put(self.entry)
            self.entry = None
            self.state = self.STATE_GREY

            # pause between journals
            sleep(self.SLEEPTIME)

    def run(self):
        """Thread body: download everything, then flag completion.

        `done` is set in a finally clause so that a missing user id or a
        network/parsing error cannot leave a consumer polling `done` forever.
        """
        try:
            self._grabAll()
        finally:
            self.done = True


#############################################################################
## Main                                                                   ###
#############################################################################
if __name__ == "__main__":
    # Whom to parse: all arguments joined by single spaces form the user name
    if len(sys.argv) > 1:
        username = ' '.join(sys.argv[1:])
    else:
        sys.stderr.write('Usage: Grabber [username]\n')
        sys.exit(-1)

    # Parse the whom
    g = Grabber(username)
    g.start()
    while not (g.done and g.queue.empty()):
        try:
            # Block briefly instead of spinning on a non-blocking get.
            je = g.queue.get(True, 0.1)
        except Queue.Empty:
            continue
        print(je.date)
    g.join()

"""
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; version 2 of the License.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
"""