#!/usr/local/bin/clisp -ansi -q -E utf-8
;;;; -*- mode:lisp; coding:utf-8 -*-
;;;;**************************************************************************
;;;;FILE:               rss2email
;;;;LANGUAGE:           Common-Lisp
;;;;SYSTEM:             Common-Lisp
;;;;USER-INTERFACE:     NONE
;;;;DESCRIPTION
;;;;
;;;;    rss2email: get RSS feeds emailed to you.
;;;;
;;;;    Usage:
;;;;      new [youremail] (create new feedfile)
;;;;      email [yournewemail] (update default email)
;;;;      run [--no-send] [num]
;;;;      add feedurl [youremail]
;;;;      list
;;;;      delete n
;;;;
;;;;AUTHORS
;;;;    <PJB> Pascal Bourguignon <pjb@informatimago.com>
;;;;MODIFICATIONS
;;;;    2006-11-26 <PJB> Converted from python by Aaron Swartz.
;;;;BUGS
;;;;LEGAL
;;;;    GPL
;;;;
;;;;    Copyright Pascal Bourguignon 2006 - 2006
;;;;
;;;;    This program is free software; you can redistribute it and/or
;;;;    modify it under the terms of the GNU General Public License
;;;;    as published by the Free Software Foundation; either version
;;;;    2 of the License, or (at your option) any later version.
;;;;
;;;;    This program is distributed in the hope that it will be
;;;;    useful, but WITHOUT ANY WARRANTY; without even the implied
;;;;    warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;;;    PURPOSE.  See the GNU General Public License for more details.
;;;;
;;;;    You should have received a copy of the GNU General Public
;;;;    License along with this program; if not, write to the Free
;;;;    Software Foundation, Inc., 59 Temple Place, Suite 330,
;;;;    Boston, MA 02111-1307 USA
;;;;**************************************************************************

;;;;;; Vaguely Customizable Options ;;;;;;

(defvar *DEFAULT-FROM* "rss2email@localhost"
  "The email address messages are from by default.")

(defvar *HTML-MAIL* :CONVERT "
:HTML     Send text/html messages when possible.
:CONVERT  Convert HTML to plain text.
")

(defvar *FORCE-FROM* nil "
T    Only use the *DEFAULT-FROM* address.
NIL  Use the email address specified by the feed, when possible.
")

(defvar *TRUST-GUID* T "
T    Receive one email per post.
NIL  Receive an email every time a post changes.
")

(defvar *DATE-HEADER* :ITEM "
:ITEM  Generate Date header based on item's date, when possible.
:SENT  Generate Date header based on time sent.
")

(defvar *DATE-HEADER-ORDER* '(:modified :issued :created) "
A list of (member :issued :created :modified :expired) giving the
order of preference in dates to use for the Date header of the email.
")

(defvar *QP-REQUIRED* NIL "
T    Apply Q-P conversion (required for some MUAs).
NIL  Send message in 8-bits.
     http://cr.yp.to/smtp/8bitmime.html
")

(defvar *VERBOSE* T "
T    Name feeds as they're being processed.
NIL  Keep quiet.
")

(defvar *USE-PUBLISHER-EMAIL* T "
T    Use the publisher's email if you can't find the author's.
NIL  Just use the *DEFAULT-FROM* email instead.
")

(defvar *SMTP-SEND* NIL "
T    Use *SMTP-SERVER* to send mail.
NIL  Call the program named by *SENDMAIL-PROGRAM* to send mail.
")

(defvar *SMTP-SERVER* "localhost:25"
  "The SMTP server used when *SMTP-SEND* is true, as \"host\" or \"host:port\".
The port defaults to 25 when it is not given.")

(defvar *BONUS-HEADER* '() "
Set this to a list of bonus header lines to add to all emails.
Example: (setf *BONUS-HEADER* '(\"Approved: joe@bob.org\"
                                \"Errors-To: joe@bob.org\"))
")

(defvar *OVERRIDE-FROM* '() "
Set this a-list to override From addresses.
Keys are feed URLs, values are new titles.
")

(defvar *sendmail-program* "/usr/sbin/sendmail"
  "Path to the sendmail program.")


(defun hostname ()
  "Return the first line printed by `hostname -f`, or (SHORT-SITE-NAME)
when the command prints nothing usable."
  (with-open-stream (input (ext:run-program "hostname"
                                            :arguments '("-f")
                                            :output :stream))
    (let ((name (read-line input nil nil)))
      ;; An empty line is no better than no line at all for HELO.
      (if (and name (plusp (length name)))
          name
          (short-site-name)))))


(defun sendmail (sender recipients message)
  "Send MESSAGE over SMTP through the server named by *SMTP-SERVER*.

SENDER      the envelope sender address (a string).
RECIPIENTS  a list of envelope recipient addresses (strings).
MESSAGE     a string: the header lines, an empty line, and the body,
            with lines separated by #\\Newline.

Return true when the server accepted the message, NIL otherwise."
  (with-open-stream
      (smtp (let* ((colon (position #\: *smtp-server*))
                   ;; The host is the text BEFORE the colon, the port after it.
                   (host  (if colon
                              (subseq *smtp-server* 0 colon)
                              *smtp-server*))
                   (port  (if colon
                              (parse-integer *smtp-server* :start (1+ colon))
                              25)))
              (socket:socket-connect port host
                                     :ELEMENT-TYPE 'character
                                     :EXTERNAL-FORMAT (ext:make-encoding
                                                       :charset charset:utf-8
                                                       :line-terminator :dos)
                                     :BUFFERED nil
                                     :TIMEOUT 30)))
    (flet ((go-on ()
             ;; Read one complete server reply and return true for a
             ;; 2xx/3xx code.  Lines of the form "250-text" announce that
             ;; more lines follow; "250 text" is the last one.  The code is
             ;; parsed with PARSE-INTEGER, never with the Lisp reader: the
             ;; text comes from the network.
             (loop
               (let* ((line (read-line smtp nil nil))
                      (code (and line
                                 (parse-integer line
                                                :end (min 3 (length line))
                                                :junk-allowed t))))
                 (cond
                   ((or (null line) (null code))
                    (return nil))
                   ((and (< 3 (length line)) (char= #\- (char line 3)))
                    nil)                ; continuation line: keep reading
                   (t
                    (return (<= 200 code 399)))))))
           (send-recipients ()
             ;; Announce EVERY recipient; true when at least one is accepted.
             (let ((accepted nil))
               (dolist (recipient recipients accepted)
                 (format smtp "RCPT TO: <~A>~%" recipient)
                 (when (go-on)
                   (setf accepted t)))))
           (send-data ()
             ;; Dot-stuff the payload: a line starting with "." would
             ;; otherwise be taken for (or hide) the end-of-data marker.
             (with-input-from-string (lines message)
               (loop
                 :for line = (read-line lines nil nil)
                 :while line
                 :do (when (and (plusp (length line))
                                (char= #\. (char line 0)))
                       (write-char #\. smtp))
                     (write-line line smtp)))
             (format smtp ".~%")))
      (let ((delivered nil))
        ;; The server speaks first: consume its greeting before HELO so
        ;; that each later reply is matched with the command that caused it.
        (when (go-on)
          (format smtp "HELO ~A~%" (hostname))
          (when (go-on)
            (format smtp "MAIL FROM: <~A>~%" sender)
            (when (and (go-on) (send-recipients))
              (format smtp "DATA~%")
              (when (go-on)
                (send-data)
                (setf delivered (go-on)))))
          (format smtp "QUIT~%")
          ;; Wait for the reply to QUIT instead of sleeping blindly
          ;; before the socket is closed.
          (go-on))
        delivered))))


(defun send (from to message)
  "Deliver MESSAGE from the address FROM to the address TO.
When *SMTP-SEND* is true the message goes through SENDMAIL (SMTP),
otherwise it is piped to the program named by *SENDMAIL-PROGRAM*."
  ;; Note: You can also override the send function.
  (if *smtp-send*
      (sendmail from (list to) message)
      (with-open-stream (sendmail (ext:run-program
                                   *sendmail-program*
                                   :arguments (LIST "-bm" "-B" "8BITMIME"
                                                    "-f" from to)
                                   :input :stream
                                   :output nil))
        (princ message sendmail))))


;;;; html2text options ;;;;

(defvar *UNICODE-SNOB* T "
Use Unicode characters instead of their ascii pseudo-replacements.
")

(defvar *LINKS-EACH-PARAGRAPH* nil "
Put the links after each paragraph instead of at the end.
")

(defvar *BODY-WIDTH* nil "
Wrap long lines at position. NIL for no wrapping.
")

;;;;;; Load the Options ;;;;;;

;; Read options from config file if present.
;; NOTE(review): everything from here to the end of the file is still
;; Python 2 source; it appears that only the comment markers (# -> ;;) and
;; the option names (FOO_BAR -> *FOO-BAR*) were translated so far.  It is
;; not readable by a Lisp reader yet.  Line breaks and indentation were
;; reconstructed from the syntax -- confirm against the Python original.

;; Best effort: any failure while importing config is silently ignored.
import sys
sys.path.append(".")
try:
    from config import *
except:
    pass

;;;;;; Import Modules ;;;;;;

import cPickle as pickle, fcntl, md5, time, os, traceback, urllib2, sys, types
;; socket_errors collects socket.error and socket.gaierror when they exist.
import socket; socket_errors = []
for e in ['error', 'gaierror']:
    if hasattr(socket, e): socket_errors.append(getattr(socket, e))
import mimify; from StringIO import StringIO as SIO; mimify.CHARSET = 'utf-8'
;; NOTE(review): in this file smtpserver is only used for the final
;; smtpserver.quit(); the Lisp SENDMAIL function opens its own connection.
if *SMTP-SEND*: import smtplib; smtpserver = smtplib.SMTP(*SMTP-SERVER*)
else: smtpserver = None

import feedparser
;; NOTE(review): __version__ is used here and in the error reports below,
;; but no definition of it is visible in this file.
feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.aaronsw.com/2002/rss2email/"

import html2text as h2t

;; Forward the html2text options to the html2text module.
h2t.*UNICODE-SNOB* = *UNICODE-SNOB*
h2t.*LINKS-EACH-PARAGRAPH* = *LINKS-EACH-PARAGRAPH*
h2t.*BODY-WIDTH* = *BODY-WIDTH*
html2text = h2t.html2text

;;;;;; Utility Functions ;;;;;;

;; All warnings and error reports are printed to standard error.
warn = sys.stderr

;; isstr: true when f is a byte string or a unicode string.
def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
;; ishtml: true when t is a tuple, the shape getContent uses for raw HTML.
def ishtml(t): return type(t) is type(())
;; contains: true when b occurs somewhere in a.
def contains(a,b): return a.find(b) != -1
;; unu: encode unicode strings as UTF-8; return anything else unchanged.
def unu(s): ;; I / freakin' hate / that unicode
    if type(s) is types.UnicodeType: return s.encode('utf-8')
    else: return s

def quote822(s):
    """Quote names in email according to RFC822."""
    ;; Backslashes are doubled first, then double quotes are escaped.
    return '"' + unu(s).replace("\\", "\\\\").replace('"', '\\"') + '"'

def header7bit(s):
    """QP_CORRUPT headers."""
    ;; A space is appended before encoding and the last character of the
    ;; result is dropped again; presumably a workaround for
    ;; mime_encode_header -- TODO confirm.
    return mimify.mime_encode_header(s + ' ')[:-1]

;;;;;; Parsing Utilities ;;;;;;

def getContent(entry, HTMLOK=0):
    """Select the best content from an entry, deHTMLizing if necessary. If raw HTML is best, an ('HTML', best) tuple is returned. """

    ;; How this works:
    ;;  * We have a bunch of potential contents.
    ;;  * We go thru looking for our first choice.
    ;;    (HTML or text, depending on HTMLOK)
    ;;  * If that doesn't work, we go thru looking for our second choice.
    ;;  * If that still doesn't work, we just take the first one.
    ;;
    ;; Possible future improvement:
    ;;  * Instead of just taking the first one
    ;;    pick the one in the "best" language.
    ;;  * HACK: hardcoded HTMLOK, should take a tuple of media types

    conts = entry.get('content', [])

    ;; The summary, when present, is tried after the 'content' items.
    if entry.get('summary_detail', {}):
        conts += [entry.summary_detail]

    if conts:
        if HTMLOK:
            for c in conts:
                if contains(c.type, 'html'): return ('HTML', c.value)

        for c in conts:
            if c.type == 'text/plain':
                return c.value

        if not HTMLOK: ;; Only need to convert to text if HTML isn't OK
            for c in conts:
                if contains(c.type, 'html'):
                    return html2text(c.value)

        return conts[0].value

    ;; No content and no summary at all.
    return ""

def getID(entry):
    """Get best ID from an entry."""
    ;; Order of preference: the entry id (only when *TRUST-GUID*), the MD5
    ;; of the content, the link, the MD5 of the title.  Falls off the end
    ;; (returning None) when none of them is available.
    if *TRUST-GUID*:
        if 'id' in entry and entry.id: return entry.id

    content = getContent(entry)
    if content: return md5.new(unu(content)).hexdigest()
    if 'link' in entry: return entry.link
    if 'title' in entry: return md5.new(unu(entry.title)).hexdigest()

def getName(r, entry):
    """Get the best name."""

    feed = r.feed
    ;; NOTE(review): *OVERRIDE-FROM* is declared as an a-list in the Lisp
    ;; options, but is used here as a dictionary (.keys() and subscript).
    if r.url in *OVERRIDE-FROM*.keys():
        return unu(*OVERRIDE-FROM*[r.url])

    ;; Feed title, followed by ", <author name>" taken from the entry or,
    ;; failing that, from the feed.
    name = feed.get('title', '')

    if 'name' in entry.get('author_detail', []): ;; normally {} but py2.1
        if entry.author_detail.name:
            if name: name += ", "
            name += entry.author_detail.name

    elif 'name' in feed.get('author_detail', []):
        if feed.author_detail.name:
            if name: name += ", "
            name += feed.author_detail.name

    return name

def getEmail(feed, entry):
    """Get the best email_address."""

    ;; Order of preference: entry author, feed author, then (only when
    ;; *USE-PUBLISHER-EMAIL*) feed publisher and errorreportsto, and
    ;; finally *DEFAULT-FROM*.
    if *FORCE-FROM*: return *DEFAULT-FROM*

    if 'email' in entry.get('author_detail', []):
        return entry.author_detail.email

    if 'email' in feed.get('author_detail', []):
        return feed.author_detail.email

    ;;TODO: contributors

    if *USE-PUBLISHER-EMAIL*:
        if 'email' in feed.get('publisher_detail', []):
            return feed.publisher_detail.email

        if feed.get("errorreportsto", ''):
            return feed.errorreportsto

    return *DEFAULT-FROM*

;;;;;; Simple Database of Feeds ;;;;;;

;; One subscribed feed.  etag and modified are passed back to
;; feedparser.parse on the next run; seen maps the id of each entry already
;; mailed to the value getID returned for it; to is the recipient address,
;; or None to use the default address.
class Feed:
    def __init__(self, url, to):
        self.url, self.etag, self.modified, self.seen = url, None, None, {}
        self.to = to

;; load: unpickle the feed list from the file named by the global feedfile
;; (set by the command line dispatcher at the bottom of the file).
;; Returns (feeds, feedfileObject); when lock is true the returned file
;; object holds an exclusive flock, to be released with unlock.
def load(lock=1):
    feedfileObject = open(feedfile, 'r')
    feeds = pickle.load(feedfileObject)
    if lock:
        fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_EX)
        ;;HACK: to deal with lock caching
        ;; The file is opened and read a second time once the lock is held,
        ;; and the new file object is locked as well.
        feedfileObject = open(feedfile, 'r')
        feeds = pickle.load(feedfileObject)
        fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_EX)

    return feeds, feedfileObject

;; unlock: save feeds, then release the lock taken by load.  The list is
;; pickled into feedfile+'.tmp' which is then renamed over feedfile.
;; NOTE(review): the temporary file object is never closed explicitly.
def unlock(feeds, feedfileObject):
    pickle.dump(feeds, open(feedfile+'.tmp', 'w'))
    os.rename(feedfile+'.tmp', feedfile)
    fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN)

;;;;;; Program Functions ;;;;;;

;; add: subscribe to feeds.  With exactly two arguments where the second
;; contains '@' and no '://', it is (url, email); otherwise every argument
;; is a URL using the default address.
def add(*args):
    if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
        urls, to = [args[0]], args[1]
    else:
        urls, to = args, None

    feeds, feedfileObject = load()
    ;; Refuse when there is neither a default address nor an explicit one.
    ;; NOTE(review): this raise happens after load() took the lock and
    ;; before unlock() is called.
    if feeds and not isstr(feeds[0]) and to is None:
        raise 'NoEmail', "Run `email newaddr` or `add url addr`."
    for url in urls: feeds.append(Feed(url, to))
    unlock(feeds, feedfileObject)

;; run: fetch every feed (or only feeds[num] when num is given and not 0)
;; and mail each entry that is not yet recorded in the feed's seen table.
;; The feed list is always written back through unlock, even on errors.
def run(num=None):
    feeds, feedfileObject = load()
    try:
        ;; We store the default to address as the first item in the feeds list.
        ;; Here we take it out and save it for later.
        ;; NOTE(review): default_to stays unbound when there is no default
        ;; address; it is only needed for feeds whose f.to is empty.
        if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]
        else: ifeeds = feeds

        ;; num indexes the full list (default address included), which is
        ;; the numbering printed by list().
        if num: ifeeds = [feeds[num]]

        for f in ifeeds:
            try:
                if *VERBOSE*: print >>warn, "I: Processing", f.url
                r = feedparser.parse(f.url, f.etag, f.modified)

                ;; Handle various status conditions, as required
                ;; 301: remember the new URL.  410: drop the feed.
                if 'status' in r:
                    if r.status == 301: f.url = r['url']
                    elif r.status == 410:
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue

                http_status = r.get('status', 200)
                http_headers = r.get('headers', {
                  'content-type': 'application/rss+xml',
                  'content-length':'1'})
                exc_type = r.get("bozo_exception", Exception()).__class__
                ;; Nothing usable came back (and it is not a 304 "not
                ;; modified"): report the most likely cause and skip the feed.
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]:
                        print >>warn, "W: error", http_status, f.url

                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML", f.url

                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page", f.url

                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on", f.url

                    elif exc_type == IOError:
                        print >>warn, "W:", r.bozo_exception, f.url

                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression", f.url

                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W:", exc_reason, f.url

                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W:", exc_reason, f.url

                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception

                    else:
                        print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue

                ;; Entries are processed in the reverse of the order in
                ;; which feedparser returned them.
                r.entries.reverse()

                for entry in r.entries:
                    id = getID(entry)

                    ;; If *TRUST-GUID* isn't set, we get back hashes of the content.
                    ;; Instead of letting these run wild, we put them in context
                    ;; by associating them with the actual ID (if it exists).

                    frameid = entry.get('id', id)

                    ;; If this item's ID is in our database
                    ;; then it's already been sent
                    ;; and we don't need to do anything more.

                    if f.seen.has_key(frameid) and f.seen[frameid] == id: continue

                    ;; Subject: the entry title (converted to text when it
                    ;; is HTML), else the first 70 characters of the content.
                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]

                    title = unu(title).replace("\n", " ")

                    datetime = time.gmtime()

                    ;; NOTE(review): *DATE-HEADER* is now declared as :ITEM or
                    ;; :SENT, which a plain truth test cannot tell apart, and
                    ;; *DATE-HEADER-ORDER* now holds keywords, not strings.
                    ;; NOTE(review): there is no break, so the LAST kind found
                    ;; in *DATE-HEADER-ORDER* wins -- confirm this is intended.
                    if *DATE-HEADER*:
                        for datetype in *DATE-HEADER-ORDER*:
                            kind = datetype+"_parsed"
                            if kind in entry: datetime = entry[kind]

                    ;; NOTE(review): *HTML-MAIL* is now declared as :HTML or
                    ;; :CONVERT; HTMLOK is only tested for truth in getContent.
                    content = getContent(entry, HTMLOK=*HTML-MAIL*)

                    link = unu(entry.get('link', ""))

                    from_addr = unu(getEmail(r.feed, entry))

                    ;; NOTE(review): *BONUS-HEADER* is concatenated as a string
                    ;; below, but is declared as a list in the Lisp options.
                    message = (
                    "From: " + quote822(header7bit(getName(r, entry))) + " <"+from_addr+">" +
                    "\nTo: " + header7bit(unu(f.to or default_to)) + ;; set a default email!
                    "\nSubject: " + header7bit(title) +
                    "\nDate: " + time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) +
                    "\nUser-Agent: rss2email" + ;; really should be X-Mailer
                    *BONUS-HEADER* +
                    "\nContent-Type: ") ;; but backwards-compatibility

                    if ishtml(content):
                        message += "text/html"

                        content = ("<html><body>\n\n" +
                                   '<h1><a href="'+link+'">'+title+'</a></h1>\n\n' +
                                   unu(content[1]).strip() + ;; drop type tag (HACK: bad abstraction)
                                   '<p>URL: <a href="'+link+'">'+link+'</a></p>' +
                                   "\n\n</body></html>")
                    else:
                        message += "text/plain"
                        content = unu(content).strip() + "\n\nURL: "+link

                    message += '; charset="utf-8"\n\n' + content + "\n"

                    if *QP-REQUIRED*:
                        ins, outs = SIO(message), SIO()
                        mimify.mimify(ins, outs)
                        message = outs.getvalue()

                    send(from_addr, (f.to or default_to), message)

                    ;; Recorded only after send returned without raising.
                    f.seen[frameid] = id

                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except KeyboardInterrupt:
                raise
            except:
                ;; Any other failure is reported and the next feed is tried.
                print >>warn, "=== SEND THE FOLLOWING TO rss2email@aaronsw.com ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue

    finally:
        unlock(feeds, feedfileObject)

;; list: print the default address (if any) and every feed with its index
;; and recipient.  The file is read without taking the lock.
;; NOTE(review): this definition shadows the Python builtin list.
def list():
    feeds, feedfileObject = load(lock=0)

    ;; Numbering starts at 1 when slot 0 holds the default address.
    if feeds and isstr(feeds[0]):
        default_to = feeds[0]; ifeeds = feeds[1:]; i=1
        print "default email:", default_to
    else: ifeeds = feeds; i = 0
    for f in ifeeds:
        print `i`+':', f.url, '('+(f.to or ('default: '+default_to))+')'
        i+= 1

;; delete: remove item n of the stored list, numbered as printed by list().
;; NOTE(review): n is not validated; n = 0 would remove the default address
;; when one is stored.
def delete(n):
    feeds, feedfileObject = load()
    feeds = feeds[:n] + feeds[n+1:]
    print >>warn, "W: feed IDs may have changed, list before deleting again"
    unlock(feeds, feedfileObject)

;; email: set the default address, replacing the current one or inserting
;; it at the front of the list when there is none yet.
def email(addr):
    feeds, feedfileObject = load()
    if feeds and isstr(feeds[0]): feeds[0] = addr
    else: feeds = [addr] + feeds
    unlock(feeds, feedfileObject)

;; Command line dispatcher: <program> feedfile action [arguments...].
;; Usage errors are raised as the string exception ie and reported at the
;; bottom, followed by the module documentation.
if __name__ == '__main__':
    ie, args = "InputError", sys.argv
    try:
        if len(args) < 3: raise ie, "insufficient args"
        feedfile, action, args = args[1], args[2], args[3:]

        if action == "run":
            ;; --no-send replaces send with a stub that only prints the
            ;; Subject line of each message (when *VERBOSE*).
            if args and args[0] == "--no-send":
                def send(x,y,z):
                    if *VERBOSE*: print 'Not sending', (
                    [x for x in z.splitlines() if x.startswith("Subject:")][0])

            ;; A trailing number selects a single feed.
            if args and args[-1].isdigit(): run(int(args[-1]))
            else: run()

        elif action == "email": email(args[0])

        elif action == "add": add(*args)

        elif action == "new":
            ;; Create the feed file, optionally holding a default address.
            if len(args) == 1: d = [args[0]]
            else: d = []
            pickle.dump(d, open(feedfile, 'w'))

        elif action == "list": list()

        elif action == "delete": delete(int(args[0]))

        else:
            raise ie, "invalid action"

        if smtpserver:
            smtpserver.quit()

    except ie, e:
        print "E:", e
        print
        print __doc__