#!python
#############################################################################
## Grabber.py 1.1.0 ###
## Slashdot journal extraction ###
## Usage: python Grabber.py [username] ###
## ###
## ###
## Copyright 2005 Leons Petrazickis ###
## Leons.Petrazickis@gmail.com ###
## LPetr.org ###
## ###
## Distributed under the GNU General Public License v2 ###
## http://www.gnu.org/licenses/gpl.txt ###
#############################################################################
import re;
import sys;
import threading;
from time import sleep;
import Queue;
# fixes LookupError: unknown encoding: idna
import encodings, encodings.idna, encodings.ascii;
#############################################################################
## Globals ###
#############################################################################
def printDebug(stuff):
    """Emit *stuff* on standard error as one newline-terminated line."""
    line = stuff + '\n'
    sys.stderr.write(line)
# Matches every character that is not an ASCII letter, digit or space.
SYMBOL_STRIP = re.compile('[^a-zA-Z0-9 ]')
# Matches each maximal run of non-space characters, i.e. one word.
SINGLE_SPACE = re.compile('([^ ]+)')
# Text removed from entry and comment bodies by the classes below.
# NOTE(review): presumably meant to be the non-breaking-space marker found in
# the fetched markup; if this literal is a plain space, every
# .replace(NOBR, '') call deletes all spaces from a body -- confirm.
NOBR = ' '


def camelCase(phrase):
    """Turn any sentence into a Java-style camelCase name.

    Characters other than ASCII letters, digits and spaces are dropped, the
    first word is lower-cased and each following word is capitalized (first
    letter upper-case, the rest lower-case).

    Returns an empty string when the phrase contains no usable characters;
    previously that case raised IndexError.
    """
    words = SINGLE_SPACE.findall(SYMBOL_STRIP.sub('', phrase))
    if not words:
        # Nothing left after stripping symbols (empty or punctuation-only).
        return ''
    tail = [word.capitalize() for word in words[1:]]
    return words[0].lower() + ''.join(tail)
def autoLineBreak(blurb):
"""Re-flow a blurb of markup text with sane linebreaks.

All existing newline characters are removed first; the replace() calls that
follow then re-insert newlines around specific markers and the result is
returned.

NOTE(review): in this copy the marker strings are missing and several
single-quoted literals are split across physical lines (which Python does not
accept), so the exact markers cannot be read from here -- presumably they
were markup tags such as paragraph and line-break tags. Restore them from a
pristine copy before relying on this function.
"""
# Drop the line structure of the input; only the markers below decide where
# lines break in the output.
blurb = blurb.replace('\n', '');
blurb = blurb.replace('
', '
\n');
#blurb = blurb.replace('
', '
\n');
blurb = blurb.replace('', '\n
');
blurb = blurb.replace('
', '\n');
return blurb;
#############################################################################
## JournalEntry ###
#############################################################################
class JournalEntry:
    """One journal entry: owner, title, date, numeric id, body and comments.

    The class attributes below control how write() names its output file:
    ``path + (numFormat % id) + '_' + camelCase(title) + suffix``.
    """

    path = "./"         # prefix prepended to the output file name
    suffix = ".txt"     # extension appended to the output file name
    numFormat = "%08d"  # zero-padded rendering of the id in the file name
    DIGEST_LEN = 256    # the max length of an entry summary
    HTML_STRIP = re.compile('<[^>]+>')  # one complete markup tag

    def __init__(self, username, id):
        """Create an empty entry for *username*; *id* is coerced with int()."""
        self.user = username
        self.title = ''
        self.date = ''
        self.id = int(id)
        self.body = ''
        self.comments = []

    def setTitle(self, title):
        self.title = title

    def getTitle(self):
        return self.title

    def setDate(self, date):
        self.date = date

    def getDate(self):
        return self.date

    def setID(self, id):
        self.id = int(id)

    def getID(self):
        return self.id

    def _clean(self, body):
        """Return *body* with the unwanted fragments removed."""
        # NOTE(review): the first replace() has an empty search string and is
        # a no-op as written; presumably a markup tag was lost from the
        # literal -- confirm against a pristine copy.
        return body.replace('', '').replace(NOBR, '')

    def setBody(self, body):
        """Replace the body with the cleaned *body*."""
        self.body = self._clean(body)

    def getBody(self):
        return self.body

    def addBody(self, body):
        """Append the cleaned *body* to the current body."""
        self.body += self._clean(body)

    def addComment(self, comment):
        """Attach *comment*; it is rendered with str() by __str__."""
        self.comments.append(comment)

    def getComments(self):
        return self.comments

    def __str__(self):
        """Render title, date, line-broken body and all comments as text."""
        comments = '\n\n'.join([str(comment) for comment in self.comments])
        parts = (self.title, '\n', self.date, '\n\n',
                 autoLineBreak(self.body), '\n\n', comments)
        return ''.join(parts)

    def getIndexItem(self):
        """Create an HTML fragment for a table of contents.

        The fragment is a link labelled with the title, followed by a
        one-line summary: the first DIGEST_LEN characters of the body with
        newlines, complete tags and stray angle brackets removed.
        """
        digest = self.body[:self.DIGEST_LEN].replace('\n', '')
        digest = self.HTML_STRIP.sub('', digest)
        # A tag cut in half by the length limit leaves a lone bracket behind.
        digest = digest.replace('>', '').replace('<', '')
        tokens = (self.user, self.id, self.title, digest)
        # The previous format string had two placeholders for these four
        # values, so every call raised TypeError.
        # NOTE(review): the link markup is a reconstruction that follows the
        # entry URL shape used by Grabber.makeEntryURL -- confirm the intended
        # fragment.
        return '<a href="http://slashdot.org/~%s/journal/%d">%s</a>: %s' % tokens

    def write(self):
        """Write the rendered entry to disk.

        The file is always closed, even when writing fails. A title that
        camelCase() cannot turn into a name contributes an empty label
        instead of aborting the write.
        """
        try:
            label = camelCase(self.title)
        except IndexError:
            # Title is empty or has no letters/digits.
            label = ''
        name = self.path + (self.numFormat % self.id) + '_' + label + str(self.suffix)
        f = open(name, 'w')
        try:
            f.write(str(self))
        finally:
            f.close()
#############################################################################
## JournalComment ###
#############################################################################
class JournalComment:
    """One reply to a journal entry: subject, author, id, date and body."""

    def __init__(self, subject):
        """Create a comment with *subject* and empty remaining fields."""
        self.subject = subject
        self.user = ''
        self.id = 0
        self.date = ''
        self.body = ''

    def getSubject(self):
        return self.subject

    def setUser(self, user):
        self.user = user

    def getUser(self):
        return self.user

    def setID(self, id):
        self.id = int(id)

    def getID(self):
        return self.id

    def setDate(self, date):
        self.date = date

    def getDate(self):
        # Previously returned the undefined global ``date`` (NameError).
        return self.date

    def _clean(self, body):
        """Return *body* without triple-tab runs and NOBR occurrences."""
        return body.replace('\t\t\t', '').replace(NOBR, '')

    def setBody(self, body):
        """Replace the body with the cleaned *body*."""
        self.body = self._clean(body)

    def getBody(self):
        return self.body

    def addBody(self, body):
        """Append the cleaned *body* to the current body."""
        self.body += self._clean(body)

    def __str__(self):
        """Render subject, author and line-broken body, ending in a separator.

        The date and id are not part of the rendered text.
        """
        return ''.join(
            ['Subject: ', self.subject, '\n',
             'From: ', self.user, '\n',
             '\n',
             autoLineBreak(self.body), '\n',
             '\t***']
        )
#############################################################################
## Grabber ###
#############################################################################
class Grabber(threading.Thread):
# Background thread that downloads one user's journal. run() looks up the
# numeric user id, collects the journal entry ids, then fetches every entry
# page and parses it line by line with the state machine below. Each finished
# JournalEntry is written to disk and put on self.queue; self.done is set to
# True after the last entry has been processed.
SLEEPTIME = 0.5; # seconds to pause between successive requests
NEXT_PREFIX = 'http://slashdot.org/journal.pl?op=display&uid='; # not referenced by the code in this class
MAX_QUEUE_SIZE = 0; #infinite
# valid instance states
STATE_GREY = 'neutral grey';
STATE_TITLE = 'title';
STATE_DATE = 'date';
STATE_BODY = 'body';
STATE_TAIL = 'tail';
STATE_COM_TITLE = 'ctitle';
STATE_COM_MAKER = 'cmaker';
STATE_COM_DATE = 'cdate';
STATE_COM_TEXT = 'cbody';
# helper regex for transitions between states
# NOTE(review): several of these literals are split across physical lines or
# end right after '^\s*' in this copy -- presumably the markup they matched
# was lost. As shown, REGEX_COM_TITLE and REGEX_LINK have no capture group
# although run() calls m.group(1) on their matches, and REGEX_BODY_CLOSE can
# match any line. Confirm every pattern against a pristine copy.
REGEX_TITLE = re.compile('^ (.*)
');
REGEX_DATE = re.compile('\s*(.*@.*)');
REGEX_BODY_OPEN = re.compile('^\s*(.*)(
)?');
REGEX_BODY_CLOSE = re.compile('^(.*)\s*');
REGEX_COM_TITLE = re.compile('^\s*');
REGEX_COM_MAKER = re.compile('^\s*by (.+) \([0-9]+\)')
REGEX_COM_DATE = re.compile('^\s*on (.+) \(');
REGEX_COM_TEXT = re.compile('^\s*');
REGEX_COM_END = re.compile('^\s*
');
# placeholder; the real pattern depends on the username and is compiled per
# instance in __init__
REGEX_UID = None;
REGEX_LINK = re.compile('^\s*');
# URL of the user's page, which run() scans for the numeric user id
def makeUserURL(self, username):
return 'http://slashdot.org/~' + username + '/';
# URL of the page listing the journal entries of the given user id (a string)
def makeListURL(self, userid):
return 'http://slashdot.org/journal.pl?op=list&uid=' + userid;
# URL of a single entry page; id must be a string because it is concatenated
def makeEntryURL(self, username, id):
return 'http://slashdot.org/~' + username + '/journal/' + id + '?threshold=-1&mode=nested&commentsort=0';
# Prepare the thread for the given username; no network access happens here.
def __init__(self, username):
threading.Thread.__init__(self);
from urllib import quote;
self.done = False;
self.state = self.STATE_GREY;
# the URL-quoted name is what goes into URLs and into each JournalEntry
self.user = quote(username);
self.uid = None;
self.url = self.makeUserURL(self.user);
# built from the raw (unquoted) username
# NOTE(review): the name is not passed through re.escape, so regex
# metacharacters in it would change the pattern -- confirm this is intended
self.REGEX_UID = re.compile("^ %s \(([0-9]+)\)" % username, re.IGNORECASE);
# Current journal entry
self.queue = Queue.Queue(self.MAX_QUEUE_SIZE);
self.entry = None;
self.comment = None;
# List of journals
self.journalIDs = [];
# Thread body: resolve the user id, list the journal ids, then fetch, parse,
# write and enqueue every entry.
def run(self):
url = self.url;
from urllib2 import urlopen;
# get the user id
for line in urlopen(url):
m = self.REGEX_UID.match(line);
if m != None:
self.uid = m.group(1);
break;
sleep(self.SLEEPTIME);
# can't continue without user id
# (this returns without setting self.done, so anything waiting on that flag
# is never signalled)
if self.uid == None:
return;
# get the journal list
url = self.makeListURL(self.uid);
for line in urlopen(url):
m = self.REGEX_LINK.match(line);
if m != None:
self.journalIDs += [m.group(1)];
sleep(self.SLEEPTIME);
# get the journal entries
for id in self.journalIDs:
# create the skeleton entry
self.entry = JournalEntry(self.user, id);
# fill in the blanks
url = self.makeEntryURL(self.user, id);
print url;
# The state only moves forward through the entry part of the page:
# GREY -> TITLE -> DATE -> BODY -> TAIL. From TAIL the comment states cycle
# COM_TITLE -> COM_MAKER -> COM_DATE -> COM_TEXT and back to TAIL.
for line in urlopen(url):
# if just starting
if self.state == self.STATE_GREY:
# if a journal title
m = self.REGEX_TITLE.match(line);
if m != None:
self.entry.setTitle(m.group(1));
self.state = self.STATE_TITLE;
continue;
# if already grabbed a title
elif self.state == self.STATE_TITLE:
m = self.REGEX_DATE.match(line);
if m != None:
self.entry.setDate(m.group(1));
self.state = self.STATE_DATE;
continue;
# if already grabbed a date
elif self.state == self.STATE_DATE:
m = self.REGEX_BODY_OPEN.match(line);
if m != None:
self.state = self.STATE_BODY;
self.entry.addBody(m.group(1));
continue;
# if currently grabbing the body
elif self.state == self.STATE_BODY:
m = self.REGEX_BODY_CLOSE.match(line);
# if the body has ended
if m != None:
self.entry.addBody(m.group(1));
self.state = self.STATE_TAIL;
continue;
else:
self.entry.addBody(line);
continue;
# once in comment mode
elif self.state == self.STATE_TAIL:
# find reply subject
m = self.REGEX_COM_TITLE.match(line);
if m != None:
self.comment = JournalComment(m.group(1));
self.state = self.STATE_COM_TITLE;
# found reply subject, looking for person's name
elif self.state == self.STATE_COM_TITLE:
# find responder's name
m = self.REGEX_COM_MAKER.match(line);
if m != None:
self.comment.setUser(m.group(1));
self.state = self.STATE_COM_MAKER;
# found person's name, looking for date
elif self.state == self.STATE_COM_MAKER:
# find date
m = self.REGEX_COM_DATE.match(line);
if m != None:
self.comment.setDate(m.group(1));
self.state = self.STATE_COM_DATE;
# found comment datetime, looking for comment text start
elif self.state == self.STATE_COM_DATE:
# find start of comment body
m = self.REGEX_COM_TEXT.match(line);
if m != None:
self.state = self.STATE_COM_TEXT;
# eat up comment body text
elif self.state == self.STATE_COM_TEXT:
# unless comment is over
m = self.REGEX_COM_END.match(line);
if m != None:
self.entry.addComment(self.comment);
self.comment = None;
self.state = self.STATE_TAIL;
else:
self.comment.addBody(line);
# having parsed every line
# a comment still open at this point is attached to the entry as-is
if self.comment != None:
self.entry.addComment(self.comment);
self.comment = None;
# persist the entry, hand it to the consumer, and reset for the next page
self.entry.write();
self.queue.put(self.entry);
self.entry = None;
self.state = self.STATE_GREY;
# pause between journals
sleep(self.SLEEPTIME);
self.done = True;
#############################################################################
## Main ###
#############################################################################
if __name__ == "__main__":
    # Whom to parse: every command-line word belongs to the username, so a
    # name containing spaces does not need quoting.
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: Grabber [username]\n')
        sys.exit(-1)
    username = ' '.join(sys.argv[1:])

    # Parse the whom
    g = Grabber(username)
    g.start()

    # Drain entries while the worker runs, then whatever is left once it has
    # stopped. Liveness is checked instead of only the worker's ``done`` flag
    # because that flag is never set when the worker stops early (user id not
    # found, or an exception in run()), which used to leave this loop
    # spinning forever.
    while g.isAlive() or not g.queue.empty():
        try:
            # Block briefly instead of polling in a tight loop.
            je = g.queue.get(True, 0.5)
        except Queue.Empty:
            continue
        print(je.date)
    g.join()
"""
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
|