## Examples
### HTTP crawler

This crawler fetches the Pragmatic Bookshelf catalog with plain HTTP requests (`Scrapex.Requester`) and parses the responses with Meeseeks, yielding follow-up jobs for each category and each book it finds.
```elixir
defmodule PragmaticBookshelfCrawler do
  use Scrapex.Crawler
  use Scrapex.Requester

  import Meeseeks
  import Meeseeks.XPath

  start_url "https://pragprog.com/titles/category/gaming?f[sort_by]=pubdate&f[category]=all&f[skill_level]=All&f[title_contains]="
  middleware :inspect
  max_retries 0
  crawl_queue_size 1

  def start(_) do
    # Fetch the listing page and queue a parse_category/1 job for every category option.
    page = request!(:get, start_url()).body
    categories = all(page, xpath("//*[@id='title_filter_params_category']/option"))

    Enum.each(categories, fn elem ->
      yield(:parse_category, %{
        category_name: text(elem),
        category_value: attr(elem, "value")
      })
    end)
  end

  def parse_category(data) do
    # Fetch the category listing and queue a parse_book/1 job for every visible book link.
    page = request!(:get, "https://pragprog.com/titles/category/gaming?f[sort_by]=pubdate&f[category]=#{data["category_value"]}&f[skill_level]=All&f[title_contains]=").body
    books = all(page, xpath("//*[@id='filter-o-matic']/div/ul/li[not(contains(@style, 'display: none'))]/a"))

    Enum.each(books, fn elem ->
      yield(:parse_book, Map.merge(data, %{book_url: attr(elem, "href")}))
    end)
  end

  def parse_book(data) do
    # Fetch the book page and emit the scraped record through the middleware pipeline.
    page = request!(:get, data["book_url"]).body

    send_data(%{
      category: data["category_name"],
      url: data["book_url"],
      title: text(one(page, xpath("//h1/text()"))),
      subtitle: text(one(page, xpath("//h2/text()"))),
      author: text(one(page, xpath("//span[@itemprop='author']"))),
      description: text(one(page, xpath("//article[@itemprop='description']/p"))),
      image_url: attr(one(page, xpath("//a[img[@itemprop='image']]")), "href")
    })
  end

  # Registered above with `middleware :inspect`; prints each scraped item as it passes through.
  def inspect(data, _opts \\ []) do
    IO.inspect(data)
  end
end
```
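For reference, the Meeseeks calls used above (`all/2`, `one/2`, `text/1`, `attr/2`, and the `xpath/1` selector macro) also work on their own, outside any crawler. Below is a minimal sketch against a hard-coded HTML fragment; the markup is made up for illustration, only the Meeseeks functions are real.

```elixir
# Standalone Meeseeks usage; the HTML fragment stands in for a fetched page body.
import Meeseeks
import Meeseeks.XPath

html = """
<ul>
  <li><a href="/titles/foo">Foo in Action</a></li>
  <li><a href="/titles/bar">Programming Bar</a></li>
</ul>
"""

# all/2 returns every match; text/1 and attr/2 extract data from each result.
for link <- all(html, xpath("//li/a")) do
  IO.puts("#{text(link)} -> #{attr(link, "href")}")
end

# one/2 returns just the first match (or nil), as used in parse_book/1 above.
html |> one(xpath("//li/a")) |> text() |> IO.puts()
```

Run as an `.exs` script, this prints each link's text and `href`, which is the same extraction pattern `parse_category/1` applies to the real listing page.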
### PhantomJS crawler

The same crawl driven through a headless browser: `Scrapex.Hound` uses Hound with PhantomJS, so pages are rendered by a real browser engine before the selectors run.
```elixir
defmodule PragmaticBookshelfCrawler do
  use Scrapex.Crawler
  use Scrapex.Hound, browser: "phantomjs"

  start_url "https://pragprog.com/titles/category/gaming?f[sort_by]=pubdate&f[category]=all&f[skill_level]=All&f[title_contains]="
  middleware :inspect

  before_parse_start do
    delete_cookies()
  end

  crawl_queue_size 4

  def start(_) do
    # Open the listing page in the browser and queue a parse_category/1 job for every category option.
    navigate_to(start_url())
    categories = find_all_elements(:xpath, "//*[@id='title_filter_params_category']/option")

    Enum.each(categories, fn elem ->
      yield(:parse_category, %{
        category_value: attribute_value(elem, "value"),
        category_name: inner_text(elem)
      })
    end)
  end

  def parse_category(data) do
    # Load the category listing, give the page's JavaScript a moment to filter the results,
    # then queue a parse_book/1 job for every visible book link.
    navigate_to("https://pragprog.com/titles/category/gaming?f[sort_by]=pubdate&f[category]=#{data["category_value"]}&f[skill_level]=All&f[title_contains]=")
    :timer.sleep(2000)
    books = find_all_elements(:xpath, "//*[@id='filter-o-matic']/div/ul/li[not(contains(@style, 'display: none'))]/a")

    Enum.each(books, fn elem ->
      yield(:parse_book, Map.merge(data, %{book_url: attribute_value(elem, "href")}))
    end)
  end

  def parse_book(data) do
    # Load the book page and emit the scraped record through the middleware pipeline.
    navigate_to(data["book_url"])

    send_data(%{
      category: data["category_name"],
      url: data["book_url"],
      title: inner_text({:tag, "h1"}),
      subtitle: inner_text({:tag, "h2"}),
      author: inner_text({:xpath, "//span[@itemprop='author']"}),
      description: inner_text({:xpath, "//article[@itemprop='description']/p"}),
      image_url: attribute_value({:xpath, "//a[img[@itemprop='image']]"}, "href")
    })
  end

  # Registered above with `middleware :inspect`; prints each scraped item as it passes through.
  def inspect(data, _opts \\ []) do
    IO.inspect(data)
  end
end
```
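The browser example relies on Hound helpers (`navigate_to/1`, `find_all_elements/2`, `inner_text/1`, `attribute_value/2`, `delete_cookies/0`); `use Scrapex.Hound` presumably sets up the Hound session around them. The sketch below shows the same helpers used directly, outside Scrapex; it assumes a WebDriver such as PhantomJS is already running and configured as Hound's driver, and the module name and URL are illustrative only.

```elixir
# Plain Hound usage, outside any crawler. Assumes a running WebDriver
# (e.g. PhantomJS) configured as Hound's driver in config.exs.
defmodule HoundSketch do
  use Hound.Helpers

  def run do
    Hound.start_session()

    navigate_to("https://pragprog.com/")
    IO.puts(page_title())

    # Element helpers accept either an element returned by find_all_elements/2
    # or a {strategy, selector} tuple, as parse_book/1 above does.
    for link <- find_all_elements(:xpath, "//a[@href]") do
      IO.puts(attribute_value(link, "href"))
    end

    Hound.end_session()
  end
end
```

The crawler above has no explicit session management, which suggests `use Scrapex.Hound` takes care of it; the helper calls themselves behave the same either way.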