0

This is a program to parse some sites. The first site is site1. All the logic to parse that particular site is located in (-> config :site1).

;; Namespace for the scraper; Enlive is aliased as `html` for parsing and
;; selecting nodes. Reference clauses inside `ns` are keywords (`:require`).
(ns program.core
  (:require [net.cgrand.enlive-html :as html]))

(def config
  "Scraping configuration, keyed by site id.

  Each site entry holds:
    :site-url      - vector of page URLs to scrape
    :url-encoding  - charset name used to decode the fetched pages
    :parsing-index - map of field name -> {:selector    Enlive selector
                                           :trimming-fn fn applied to each
                                                        selected node}"
  {:site1
   {:site-url     ["http://www.site1.com/page/1"
                   "http://www.site1.com/page/2"
                   "http://www.site1.com/page/3"
                   "http://www.site1.com/page/4"]
    :url-encoding "iso-8859-1"
    :parsing-index
    {:date
     {:selector    [[:td.PadMed (html/nth-of-type 1)] :table [:tr (html/nth-of-type 2)]
                    [:td (html/nth-of-type 3)] [:span]]
      ;; Take the first child of the node's :content (unwraps the enclosing seq).
      :trimming-fn (comp first :content)}

     :title
     {:selector    [[:td.PadMed (html/nth-of-type 1)] :table :tr [:td (html/nth-of-type 2)] [:a]]
      ;; First child of the first child of the matched <a>.
      :trimming-fn (comp first :content first :content)}

     :url
     {:selector    [[:td.PadMed (html/nth-of-type 1)] :table :tr [:td (html/nth-of-type 2)] [:a]]
      ;; Prefix the link's href with the site host, which must be the same host
      ;; as the :site-url entries. Presumably the hrefs are site-relative -- confirm.
      :trimming-fn #(str "http://www.site1.com" (:href (:attrs %)))}}}})
    ;=== Fetch fn ===;

    (defn fetch-encoded-url
      "Fetch `url` (a string), decode the response body with the charset
      `encoding` (default \"utf-8\") and return it parsed by Enlive's
      `html/html-resource`.

      The underlying stream and reader are closed before returning, so the
      HTTP connection is not leaked."
      ([url] (fetch-encoded-url url "utf-8"))
      ([url encoding]
       ;; `.openStream` always yields an InputStream, whereas `.getContent`
       ;; returns Object and depends on the response's content type.
       (with-open [stream (.openStream (java.net.URL. ^String url))
                   reader (java.io.InputStreamReader. stream encoding)]
         (html/html-resource reader))))

Now I want to parse the pages contained in (-> config :site1 :site-url). In this example I use only the first URL, but how can I design this to actually do a kind of master `for` over all the URLs?

(defn parse-element
  "Scrape one field of :site1 from the first configured page.

  `element` is a key of (-> config :site1 :parsing-index). The page is
  fetched with the site's :url-encoding, the field's :selector is applied to
  it, and the field's :trimming-fn is mapped over the selected nodes.
  Returns a vector of the trimmed values."
  [element]
  (let [site                           (:site1 config)
        {:keys [selector trimming-fn]} (element (:parsing-index site))
        ;; Only the first URL of :site-url is used here.
        page                           (fetch-encoded-url (first (:site-url site))
                                                          (:url-encoding site))]
    (mapv trimming-fn (html/select page selector))))

;; Rows of scraped values for :site1. Each field of the parsing index is
;; parsed into its own vector (one column per field); the columns are then
;; transposed with `(apply map vector ...)` so every item holds one value per
;; field. Evaluated when the namespace loads, which calls `parse-element`
;; once per field.
(def element-lists
  (->> (-> config :site1 :parsing-index keys)
       (map parse-element)
       (apply map vector)))

;; Vector of maps, one per row of `element-lists`, keyed by field name.
;; The labels come from the same `keys` call that ordered the columns in
;; `element-lists`, so values cannot be mislabelled if fields are added to or
;; reordered in the parsing index.
(def tagged-lists
  (let [field-names (-> config :site1 :parsing-index keys)]
    (mapv #(zipmap field-names %) element-lists)))

;==== Fn call ====
    ;; Print the vector of tagged maps (one map per scraped row) to stdout.
    (println tagged-lists)
leontalbot
  • 2,513
  • 1
  • 23
  • 32

1 Answers1

1

Pass :site1 as an argument to parse-element and element-lists.

(defn parse-element
  "Scrape one field of `site` from that site's first configured page.

  `site` is a key of `config` and `element` a key of the site's
  :parsing-index. The field's :selector is applied to the fetched page and
  its :trimming-fn is mapped over the selected nodes. Returns a vector of the
  trimmed values."
  [site element]
  (let [site-conf                      (site config)
        {:keys [selector trimming-fn]} (element (:parsing-index site-conf))
        ;; Only the first URL of :site-url is used here.
        page                           (fetch-encoded-url (first (:site-url site-conf))
                                                          (:url-encoding site-conf))]
    (mapv trimming-fn (html/select page selector))))

(defn element-lists
  "Return the scraped rows for `site` (a key of `config`).

  Every field in the site's :parsing-index is parsed with `parse-element`
  into a column of values; the columns are transposed with
  `(apply map vector ...)` so each returned item holds one value per field,
  in the order of `(keys parsing-index)`."
  [site]
  (apply map vector
         (map (partial parse-element site)
              (-> config site :parsing-index keys))))

And then map over :site1 :site2… keys.


Addendum in answer to the further question in the comments.

You could wrap the html/select in a map over the :site-url vector. Something like:

(defn parse-element
  "Scrape one field of `site` across every URL in the site's :site-url.

  `site` is a key of `config` and `element` a key of the site's
  :parsing-index. Each page is fetched with the site's :url-encoding and the
  field's :selector is applied to it. The selected nodes of all pages are
  concatenated (`mapcat`), so the field's :trimming-fn still receives one
  node at a time. Returns a vector of the trimmed values, in page order."
  [site element]
  (let [{:keys [site-url url-encoding parsing-index]} (site config)
        {:keys [selector trimming-fn]}                (element parsing-index)
        select-page (fn [url]
                      (html/select (fetch-encoded-url url url-encoding) selector))]
    (into [] (map trimming-fn (mapcat select-page site-url)))))

(I hope I got the parens right.)

Then you'll probably need to check the :trimming-fn, in order for it to handle the nesting. An apply should suffice.

i-blis
  • 3,149
  • 24
  • 31
  • Great! And how can i do this, for multiple urls within a given site ? :site-url ["http://www.site1.com/page/1" "http://www.site1.com/page/2" "http://www.site1.com/page/3" "http://www.site1.com/page/4"] – leontalbot May 07 '13 at 01:52
  • You will need to rewrite `parse-element` to map over the `:site-url` vector instead of taking but its first element by wrapping the whole `html/select`. See my edit. – i-blis May 07 '13 at 02:13
  • @user1184248 Consider upvoting and/or accepting the answer if it fits your expectations. – i-blis May 07 '13 at 02:25