;;;; -*- mode:lisp;coding:utf-8 -*-
;;;;**************************************************************************
;;;;FILE:               feed-to-full.lisp
;;;;LANGUAGE:           Common-Lisp
;;;;SYSTEM:             Common-Lisp
;;;;USER-INTERFACE:     NONE
;;;;DESCRIPTION
;;;;
;;;;    This script transforms a rss 2.0 feed.xml into a full feed
;;;;    (including the full article in the description).
;;;;
;;;;AUTHORS
;;;;    <PJB> Pascal J. Bourguignon <pjb@informatimago.com>
;;;;MODIFICATIONS
;;;;    2015-10-24 <PJB> Created.
;;;;BUGS
;;;;LEGAL
;;;;    AGPL3
;;;;
;;;;    Copyright Pascal J. Bourguignon 2015 - 2015
;;;;
;;;;    This program is free software: you can redistribute it and/or modify
;;;;    it under the terms of the GNU Affero General Public License as published by
;;;;    the Free Software Foundation, either version 3 of the License, or
;;;;    (at your option) any later version.
;;;;
;;;;    This program is distributed in the hope that it will be useful,
;;;;    but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;;    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;;;    GNU Affero General Public License for more details.
;;;;
;;;;    You should have received a copy of the GNU Affero General Public License
;;;;    along with this program.  If not, see <http://www.gnu.org/licenses/>.
;;;;**************************************************************************
(in-package "COMMON-LISP-USER")

#+ccl (eval-when (:compile-toplevel :load-toplevel :execute)
        (setf ccl:*default-external-format*           :unix
              ccl:*default-file-character-encoding*   :utf-8
              ccl:*default-line-termination*          :unix
              ccl:*default-socket-character-encoding* :utf-8))

(defmacro redirecting-stdout-to-stderr (&body body)
  (let ((verror  (gensym))
        (voutput (gensym)))
    `(let* ((,verror  nil)
            (,voutput (with-output-to-string (stream)
                        (let ((*standard-output* stream)
                              (*error-output*    stream)
                              (*trace-output*    stream))
                          (handler-case (progn ,@body)
                            (error (err) (setf ,verror err)))))))
       (when ,verror
         (terpri *error-output*)
         (princ ,voutput *error-output*)
         (terpri *error-output*)
         (princ ,verror *error-output*)
         (terpri *error-output*)
         (terpri *error-output*)
         #+(and clisp (not testing-script)) (ext:exit 1)))))

(redirecting-stdout-to-stderr
  (load (merge-pathnames "quicklisp/setup.lisp" (user-homedir-pathname)))
  #+clisp (load (merge-pathnames "quicklisp/asdf.lisp"  (user-homedir-pathname))))


(redirecting-stdout-to-stderr
  #+clisp (ql:quickload "uiop")
  (ql:quickload "xmls")
  (ql:quickload "drakma")
  (ql:quickload "babel")
  (ql:quickload "com.informatimago.common-lisp"))

;;;------------------------------------------------------------------------

(defpackage "FEED-TO-FULL"
  (:use "COMMON-LISP"
        "XMLS"
        "DRAKMA"
        "BABEL"
        "COM.INFORMATIMAGO.COMMON-LISP.HTML-BASE.ML-SEXP"
        "COM.INFORMATIMAGO.COMMON-LISP.HTML-PARSER.PARSE-HTML"
        "COM.INFORMATIMAGO.COMMON-LISP.CESARUM.SEQUENCE")
  (:import-from  "COM.INFORMATIMAGO.COMMON-LISP.HTML-GENERATOR.HTML"
                 "WRITE-ESCAPING")
  (:export "MAIN" "FULLFEED"))
(in-package "FEED-TO-FULL")



(defun recode (string)
  (octets-to-string (string-to-octets string :encoding :iso-8859-1) :encoding :utf-8))

(defun get-resource (url)
  (multiple-value-bind (value status)
      (let ((*drakma-default-external-format* :utf-8))
        (http-request url :connection-timeout 10))
    (values (when (= 200 status)
              (find-if (lambda (entity) (and (listp entity) (eq :html (html-tag entity))))
                       (parse-html-string value)))
            status)))

(defun remove-comments (entity)
  (cond
    ((atom entity) entity)
    ((or (eq :comment (element-tag entity))
         (and (eq :div (element-tag entity))
              (member (value-of-attribute-named entity "CLASS")
                      '("TOP" "MENU" "BOTTOM")
                      :test (function string=))))
     nil)
    (t
     (make-element (element-tag entity)
                   (element-attributes entity)
                   (mapcan (lambda (child)
                             (let ((child (remove-comments child)))
                               (when child (list child))))
                           (element-children entity))))))

(defun unwrap (document)
  (let* ((html        document)
         (head        (child-tagged html :head))
         (title       (element-child (child-tagged head :title)))
         (author      (value-of-attribute-named (child-tagged-and-valued head :meta :name "author")      :content))
         (description (value-of-attribute-named (child-tagged-and-valued head :meta :name "description") :content))
         (keywords    (value-of-attribute-named (child-tagged-and-valued head :meta :name "keywords")    :content))
         (language    (or (value-of-attribute-named html :lang)
                          (value-of-attribute-named html :xml\:lang)
                          "en"))
         (class       "document")
         (document    (remove-comments (first (grandchildren-tagged-and-valued html "div" "class" class))))
         (id          (value-of-attribute-named document :id)))
    (declare (ignore id))
    (values title author description keywords language document)))

(defun split-email (email)
  "RETURN: name; address."
  ;; TODO: implement https://tools.ietf.org/html/rfc5322
  (let ((lp (position #\(  email)))
    ;; address@domain (name)
    (when lp
      (return-from split-email
        (values (subseq email (1+ lp) (position #\) email :start lp))
                (subseq email 0 (if (and (plusp lp)
                                         (char= #\space (aref email (1- lp))))
                                    (1- lp)
                                    lp))))))
  (let ((lt (position #\< email)))
    ;; name <address@domain>
    (when lt
      (return-from split-email
        (values (subseq email 0 (if (and (plusp lt)
                                         (char= #\space (aref email (1- lt))))
                                    (1- lt)
                                    lt))
                (subseq email (1+ lt) (position #\> email :start lt))))))
  (error "Unexpected email address format: ~S" email))

(defun iso8601-date (&key (time (get-universal-time)) (timezone 0 timezonep))
  (multiple-value-bind (se mi ho da mo ye dow dst tz)
      (if timezonep
          (decode-universal-time time timezone)
          (decode-universal-time time))
    (declare (ignore dow dst))
    (multiple-value-bind (tzh tzm) (truncate (abs tz))
      (format nil "~4,'0D-~2,'0D-~2,'0DT~2,'0D:~2,'0D:~2,'0D~:[+~;-~]~4,'0D"
              ye mo da ho mi se (plusp tz) (+ (* tzh 100) (truncate tzm 1/60))))))

(defun rfc822-date (&key (time (get-universal-time)) (timezone 0 timezonep))
  (multiple-value-bind (se mi ho da mo ye dow dst tz)
      (if timezonep
          (decode-universal-time time timezone)
          (decode-universal-time time))
    (declare (ignore dst))
    (multiple-value-bind (tzh tzm) (truncate (abs tz))
      (format nil "~[Mon~;Tue~;Wed~;Thi~;Fri~;Sat~;Sun~], ~2,'0D ~
                   ~[~;Jan~;Feb~;Mar~;Apr~;May~;Jun~;Jul~;Aug~;Sep~;Oct~;Nov~;Dec~] ~
                   ~4,'0D ~2,'0D:~2,'0D:~2,'0D ~:[+~;-~]~4,'0D"
              dow da mo ye ho mi se (plusp tz) (+ (* tzh 100) (truncate tzm 1/60))))))

(defun parse-rfc822-date (date)
  (with-standard-io-syntax
    (let ((*package* (load-time-value (find-package "KEYWORD"))))
      (with-input-from-string (in (substitute #\space #\, (substitute #\space #\: date)))
        (let ((dow (read in))
              (da (read in))
              (mo (let ((month (read in)))
                    (1+ (or (position month #(:jan :feb :mar :apr :may :jun
                                              :jul :aug :sep :oct :nov :dec))
                            (error "Invalid month ~S in date ~S" month date)))))
              (ye (read in))
              (ho (read in))
              (mi (read in))
              (se (read in))
              (tz (read in)))
          (declare (ignore dow))
          (check-type da (integer 1 31))
          (check-type ye (integer 1900 4000))
          (check-type ho (integer 0 23))
          (check-type mi (integer 0 59))
          (check-type se (integer 0 60))
          (check-type tz (integer -2400 +2400))
          (encode-universal-time se mi ho da mo ye
                                 (multiple-value-bind (tzh tzm) (truncate (abs tz) 100)
                                   (* (if (plusp tz)
                                          -1
                                          +1)
                                      (+ tzh (/ tzm 60))))))))))


(defun rfc822-to-iso8601 (date)
  (iso8601-date :time (parse-rfc822-date date)))

(defun test/split-email ()
  (assert (equal (multiple-value-list (split-email "Pascal Bourguignon <pjb@informatimago.com>"))
                 '("Pascal Bourguignon" "pjb@informatimago.com")))
  (assert (equal (multiple-value-list (split-email "pjb@informatimago.com (Pascal Bourguignon)"))
                 '("Pascal Bourguignon" "pjb@informatimago.com")))
  :success)

(defun full-item (item version)
  (let* ((blog-desc   (xmlrep-string-child (xmlrep-find-child-tag "description" item)))
         (link        (xmlrep-string-child (xmlrep-find-child-tag "link"        item)))
         (guid        (xmlrep-string-child (xmlrep-find-child-tag "guid"        item)))
         (author      (xmlrep-string-child (xmlrep-find-child-tag "author"      item)))
         (title       (xmlrep-string-child (xmlrep-find-child-tag "title"       item)))
         (category    (xmlrep-string-child (xmlrep-find-child-tag "category"    item)))
         (pubDate     (xmlrep-string-child (xmlrep-find-child-tag "pubDate"     item)))
         (image-url   (xmlrep-attrib-value "url" (xmlrep-find-child-tag "enclosure" item)))
         (article     (get-resource link)))
    (multiple-value-bind (article-title article-author description keywords language document) (unwrap article)
      (declare (ignore article-title article-author description))
      (multiple-value-bind (author-name author-email) (split-email author)
        (let ((content
                (with-output-to-string (out)
                                    (unparse-html
                                     (remove-comments
                                      `(:div (:class "blog")
                                         (:div (:class "header")
                                           (:img (:src ,image-url :height "100px" :class "floatRight"))
                                           (:h1 () (:a (:href ,link) ,title))
                                           (:p () ,blog-desc)
                                           (:p () (:small ()
                                                    (:a (:href ,(format nil "mailto:~A" author-email))
                                                      ,author-name)))
                                           (:p () (:small () ,keywords)))
                                         ,document))
                                     out))
                ;; (with-output-to-string (out)
                ;;   (write-escaping '((#\& . "&amp;")
                ;;                     (#\< . "&lt;")
                ;;                     (#\> . "&gt;")
                ;;                     (#\" . "&quot;"))
                ;;
                ;;                   out))
                ))
          (ecase version
            ((2)
             `("item" ()
                     ("dc:identifier" () ,guid)
                     ("dc:creator"    () ,author)
                     ("dc:language"   () ,language)
                     ("dc:format"     () "text/html")
                     ("title"         () ,title)
                     ("link"          () ,link)
                     ("guid"          () ,guid)
                     ("category"      () ,category)
                     ("pubDate"       () ,pubDate)
                     ("description"   () ,content)))
            ((1)
             `("entry" nil
                      ("id"        () ,guid)
                      ("published" () ,(rfc822-to-iso8601 pubDate))
                      ("updated"   () ,(iso8601-date))
                      ("title"   (("type" "text")) ,title)
                      ("content" (("type" "html")) ,content)
                      ("link" (("rel" "self")
                              ("type" "application/atom+xml")
                               ("href" ,link)))
                      ("author" ()
                               ("name" () ,author-name)
                               ("email" () ,author-email))))))))))


(defun string-child-tagged (tag element)
  (or (ignore-errors (xmlrep-string-child (xmlrep-find-child-tag tag element)))
      ""))

(defun write-full-feed (xml version stream)
  (write-line "<?xml version=\"1.0\" encoding=\"utf-8\"?>" stream)
  (ecase version
    ((2) (write-line "<?xml-stylesheet type=\"text/xsl\" href=\"css/feed.xsl\" ?>" stream))
    ((1) (write-line "<?xml-stylesheet href=\"http://www.blogger.com/styles/atom.css\" type=\"text/css\" ?>" stream)))
  (write-xml xml stream :indent nil))

(defun full-channel (channel version)
  (let* ((channel        (list* (car channel) (cadr channel)
                                (remove-if (function consp) (cddr channel)
                                           :key (function first))))
         (title          (string-child-tagged "title"          channel))
         (link           (string-child-tagged "link"           channel))
         (category       (string-child-tagged "category"       channel))
         (description    (string-child-tagged "description"    channel))
         (copyright      (string-child-tagged "copyright"      channel))
         (language       (string-child-tagged "language"       channel))
         (ttl            (string-child-tagged "ttl"            channel))
         (pubDate        (string-child-tagged "pubDate"        channel))
         (managingEditor (string-child-tagged "managingEditor" channel))
         (webMaster      (string-child-tagged "webMaster"      channel))
         (image          (xmlrep-find-child-tag "image"        channel))
         (items          (xmlrep-find-child-tags "item"        channel))
         (fulllink       (let ((pos (position #\/ link :from-end t)))
                           (replace-subseq "full" (copy-seq link) pos pos))))
    (declare (ignore language))
    (multiple-value-bind (author-name author-email) (split-email managingEditor)
      (ecase version
        ((2)
         `("rss" (("version"       "2.0")
                  ("xmlns:atom"    "http://www.w3.org/2005/Atom")
                  ("xmlns:content" "http://purl.org/rss/1.0/modules/content/")
                  ("xmlns:dc"      "http://purl.org/dc/elements/1.1/")
                  ("xmlns:media"   "http://search.yahoo.com/mrss/")
                  ("xmlns:og"      "http://ogp.me/ns#"))
                 ("channel" nil
                            ("atom:link" (("rel" "self")                             ("href" ,fulllink)))
                            ("atom:link" (("rel" "alternate") ("title" "Source URL") ("href" ,link)))
                            ("title"          () ,title)
                            ("category"       () ,category)
                            ("copyright"      () ,copyright)
                            ("pubDate"        () ,pubDate)
                            ("lastBuildDate"  () ,(rfc822-date))
                            ("managingEditor" () ,managingEditor)
                            ("webMaster"      () ,webMaster)
                            ("link"           () ,link)
                            ("ttl"            () ,ttl)
                            ("description"    () ,description)
                            ,image
                            ,@(mapcar (lambda (item) (full-item item version)) items))))

        ((1)
         `("feed" (("xmlns" "http://www.w3.org/2005/Atom")
                   ("xmlns:openSearch" "http://a9.com/-/spec/opensearchrss/1.0/")
                   ("xmlns:blogger" "http://schemas.google.com/blogger/2008")
                   ("xmlns:georss" "http://www.georss.org/georss")
                   ("xmlns:gd" "http://schemas.google.com/g/2005")
                   ("xmlns:thr" "http://purl.org/syndication/thread/1.0"))
                  ("updated" () ,(rfc822-date))
                  ("category"  (("term" ,category)))
                  ("title"     (("type" "text")) ,title)
                  ("link"      (("href" ,link)
                               ("type" "application/atom+xml")
                               ("rel" "self")))
                  ("author"    ()
                              ("name" () ,author-name)
                              ("email" () ,author-email))
                  ("openSearch:totalResults" () ,(prin1-to-string (length items)))
                  ("openSearch:itemsPerPage" () ,(prin1-to-string (length items)))
                  ("openSearch:startIndex" () "1")
                  ,@(mapcar (lambda (item) (full-item item version)) items)))))))

(defun fullfeed (input output)
  (let* ((feed           (parse input))
         (channel        (xmlrep-find-child-tag "channel" feed)))
    (write-full-feed (full-channel channel 1) 1 output)))


(defun main (&optional arguments)
  (unless (and (= 1 (length arguments))
               (probe-file (first arguments)))
    (error "This script requires one rss 2.0 xml file pathname argument."))
  (with-open-file (input (first arguments))
    (fullfeed input *standard-output*)))

#+(and clisp (not testing-script))
(progn
  (main ext:*args*)
  (ext:exit 0))

#+testing-script (setf *default-pathname-defaults* #P"/Users/pjb/public_html/sites/com.informatimago.www/blog/")

(with-open-file (input "feed.xml")
  (with-open-file (output "fullfeed.xml"
                          :direction :output
                          :if-does-not-exist :create
                          :if-exists :supersede)
    (fullfeed input output)))



(pushnew :testing-script *features*)
#-(and) (progn


          (fullfeed #P"~/public_html/sites/com.informatimago.www/blog/feed.xml")

          )


;;;; THE END ;;;;
ViewGit