require 'rubygems' require 'hpricot' module Scrubyt ## #=Various XPath utility functions class XPathUtils #Find the LCA (Lowest Common Ancestor) of two nodes def self.lowest_common_ancestor(node1, node2) path1 = traverse_up(node1) path2 = traverse_up(node2) return node1.parent if path1 == path2 closure = nil while (!path1.empty? && !path2.empty?) closure = path1.pop return closure.parent if (closure != path2.pop) end path1.size > path2.size ? path1.last.parent : path2.last.parent end ## #Generate XPath for the given node # #*parameters* # #_node_ - The node we are looking up the XPath for # #_stopnode_ - The Xpath generation is stopped and the XPath that #was generated so far is returned if this node is reached. # #_write_indices_ - whether the index inside the parent shuold be #added, as in html[1]/body[1]/table[2]/tr[1]/td[8] def self.generate_XPath(node, stopnode=nil, write_indices=false) path = [] indices = [] found = false while !node.nil? && node.class != Hpricot::Doc do if node == stopnode found = true break end path.push node.name indices.push find_index(node) if write_indices node = node.parent end #This condition ensures that if there is a stopnode, and we did not found it along the way, #we return nil (since the stopnode is not contained in the path at all) return nil if stopnode != nil && !found result = "" if write_indices path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" } else path.reverse.each{ |node| result += "#{node}/" } end "/" + result.chop end #Generate an XPath of the node with indices, relatively to the given #relative_root. # #For example if the elem's absolute XPath is /a/b/c, #and the relative root's Xpath is a/b, the result of the function will #be /c. def self.generate_relative_XPath( elem,relative_root ) return nil if (elem == relative_root) generate_XPath(elem, relative_root, true) end #Generate a generalized XPath (i.e. without indices) of the node, #relatively to the given relative_root. 
#
    #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
    #and the relative root's XPath is a[1]/b[3], the result of the function will
    #be /c.
    #
    #Returns nil when elem and relative_root are the same node.
    def self.generate_generalized_relative_XPath(elem, relative_root)
      if elem == relative_root
        nil
      else
        #false => leave the positional indices out of the generated path
        generate_XPath(elem, relative_root, false)
      end
    end

    #Find an image based on the src of the example
    #
    #*parameters*
    #
    #_doc_ - The containing document
    #
    #_example_ - The value of the src attribute of the img tag.
    #This is convenient, since if the user right-clicks an image and
    #copies the image location, this string will be copied to the clipboard
    #and thus can be easily pasted as an example
    #
    #_index_ - there might be more images with the same src on the page -
    #most typically the user will need the 0th - but if this is not the
    #case, there is the possibility to override this
    def self.find_image(doc, example, index=0)
      #Search the document for every img whose src equals the example, then pick one
      matching_images = doc / "//img[@src='#{example}']"
      matching_images[index]
    end

    ##
    #Used to find the parent of a node with the given name - for example
    #find the