ToxicCode     =Home= / =Config Files= / =Shell Scripts= / =Articles= / =Text Files= / =Misc= /

Register | Login | Submit new shell script

Title: wikiscraper
Submitted by: Fingel on 2009-03-16
Function: Search wikipedia and return results + overview

#!/usr/bin/env ruby
# Wikiscraper
# written by mentallaxative
# 

require 'rubygems'
require 'hpricot'
require 'open-uri'

# Interactive command-line Wikipedia search.
#
# Runs a full-text search on en.wikipedia.org for the given phrase, prints the
# numbered result lines, asks the user to pick one, then prints the text of the
# chosen article that precedes its table of contents.
class Wikiscraper

  # Scheme and host used to turn site-relative result links into full URLs.
  WIKI_BASE = 'http://en.wikipedia.org'

  # Matches the opening tag of the element carrying id="toc".
  # NOTE(review): the marker string in the original script was empty (apparently
  # lost in transit), so this pattern is a reconstruction - confirm it against
  # the markup Wikipedia currently serves.
  TOC_TAG = /<[^<>]*\bid=["']toc["']/

  # arguments - Array of Strings that together form the search phrase
  #             (normally ARGV). The array is emptied as a side effect.
  def initialize(arguments)
    # Form-encode the whole phrase so reserved characters such as '&', '+' and
    # '#' cannot break the query string (space still becomes '+', and
    # parentheses still become %28 / %29).
    @keyword = URI.encode_www_form_component(arguments.join(" "))
    # Historically required so Kernel#gets would not read ARGV entries as file
    # names; kept because callers may rely on the array being emptied.
    arguments.clear

    @search_links = []
  end

  # Runs the whole interactive session: search, list, prompt, print.
  def main
    process_search
    populate_arrays
    get_selection
    final_page_processing
  end

  # Fetches and parses the search results page for the keyword into @s_page.
  def process_search
    search_uri = "http://en.wikipedia.org/wiki/Special:Search?search=#{@keyword}&fulltext=Search"
    @s_page = fetch_document(search_uri)
  end

  # Collects the result links into @search_links and prints a numbered list of
  # the result text, one entry per line of that text.
  def populate_arrays
    #results_data contains all search result names
    results_data = (@s_page/"ul.mw-search-results/li/:not(div.mw-search-result-data)").inner_text

    #search_links is an array of links of search results
    (@s_page/"ul.mw-search-results"/:a).each do |anchor|
      href = anchor.attributes['href']
      # Non-destructive conversion: a link that needs no rewriting is kept
      # as-is instead of being stored as nil.
      @search_links << absolute_url(href) if href
    end

    #print out the search results
    results_data.each_line.each_with_index do |title, position|
      print "#{position + 1}: #{title}\r"
    end
  end

  # Prompts for an entry number, downloads that article and stores the part of
  # it that precedes the table of contents in @wiki_results (the whole page
  # when no table of contents is found).
  #
  # Raises ArgumentError when no input is available or the number does not
  # refer to one of the collected links.
  def get_selection
    puts "\nType in a entry number:"
    @number = read_entry_index
    selection = fetch_document(@search_links[@number])

    #gets rid of everything after the table of contents
    intro = html_before_toc(selection.to_s)
    @wiki_results = intro ? Hpricot(intro) : selection
  end

  # Prints either the "no article" notice or the disambiguation notes followed
  # by the article paragraphs with citation markers removed.
  def final_page_processing
    (@wiki_results/"table.infobox").remove
    @text_body = (@wiki_results/"#bodyContent/p/:not(#coordinates)")

    no_article_found = (@wiki_results/"div.noarticletext")

    unless no_article_found.empty?
      puts no_article_found.inner_text
      return
    end

    puts((@wiki_results/"div.dablink").inner_text) #disambig info
    puts "\n"
    # each_line rather than String#each, which was removed in Ruby 1.9.
    strip_citations(@text_body.inner_text).each_line { |line| puts line }
  end

  private

  # Downloads url and returns it parsed by Hpricot. Going through URI#open
  # (provided by open-uri) instead of Kernel#open keeps working on Ruby 3 and
  # means a scraped string can never be interpreted as a "|command" pipe.
  def fetch_document(url)
    Hpricot(URI.parse(url).open)
  end

  # Resolves a link taken from the results page against WIKI_BASE and returns
  # it as a String; already-absolute links pass through unchanged.
  def absolute_url(href)
    URI.join(WIKI_BASE, href.to_s).to_s
  rescue URI::InvalidURIError
    # Fall back to plain prefixing for hrefs the URI parser rejects.
    "#{WIKI_BASE}#{href}"
  end

  # Returns the part of html that precedes the table-of-contents tag, or nil
  # when the page has no such tag.
  def html_before_toc(html)
    toc_start = html.index(TOC_TAG)
    toc_start && html[0...toc_start]
  end

  # Removes bracketed citation marks like [1], [3], [12] from text.
  def strip_citations(text)
    text.gsub(/\[[\w\d]{0,3}\]/, "")
  end

  # Reads the user's 1-based choice from standard input and returns it as a
  # 0-based index into @search_links.
  def read_entry_index
    input = $stdin.gets
    raise ArgumentError, "No entry number was given." if input.nil?

    index = input.to_i - 1
    unless index >= 0 && index < @search_links.size
      raise ArgumentError, "Entry number must be between 1 and #{@search_links.size}."
    end
    index
  end

end

# Script entry point: search for the phrase given on the command line, or
# print a usage hint when no phrase was supplied.
if ARGV.any?
  Wikiscraper.new(ARGV).main
else
  puts "Please supply a subject to search Wikipedia for."
end


Comments



Nothing Copyrighted