module Scrubyt
  ##
  #=Rejecting result instances based on further rules
  #
  #The two most trivial problems with a set of rules are that they match either
  #fewer or more instances than we would like them to. Constraints are a way to
  #remedy the second problem: they serve as a tool to filter out some result
  #instances based on rules. A typical example:
  #
  #* *ensure_presence_of_ancestor_pattern* consider this model
  #  (illustrative; the names are the ones used in the explanation below):
  #
  #    book
  #      author
  #      title
  #
  #If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book'
  #with values 'author' and 'title', only those books will be matched which have
  #an author and a title (i.e. the child patterns author and title must extract
  #something). This is a way to say 'a book MUST have an author and a title'.
  class Constraint
    #There are several possible ways of applying/checking constraints that can
    #not be checked in the context node (e.g. ensure_presence_of - since it may
    #require the evaluation of child patterns of the context pattern to an
    #arbitrary level).
    #
    #In such cases, the possibilities are:
    #
    #1) Make a depth-first evaluation from the context pattern until the needed
    #   ancestor pattern is evaluated. This can mess things up, since if any
    #   ancestor node uses the sinks of predecessor(s) other than the context
    #   node, those need to be evaluated too, and we may run into a cyclic
    #   dependency or at least a complicated recursion
    #
    #2) Post processing - evaluate normally and throw out results which do not
    #   pass the constraint
    #
    #2b) Do it on the XML level - most probably this solution will be implemented

    # Different constraint types
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4

    attr_reader :type, :target

    #Add 'ensure presence of ancestor pattern' constraint
    #If this type of constraint is added to a pattern, it must have an ancestor
    #pattern (child pattern, or child pattern of a child pattern, etc.) denoted
    #by "ancestor". 'Has an ancestor pattern' means that the ancestor pattern
    #actually extracts something (just by looking at the wrapper model, the
    #ancestor pattern is always present).
    #Note that there is no 'ensure_absence' version of this constraint, since
    #I could not think of a use case for that.
    def self.add_ensure_presence_of_pattern(ancestor)
      Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
    end

    #Add 'ensure absence of attribute' constraint
    #If this type of constraint is added to a pattern, the HTML node it targets
    #must NOT have an attribute named "attribute_name" with the value
    #"attribute_value". attribute_hash maps attribute names to values; a nil
    #value stands for 'any value'.
    def self.add_ensure_absence_of_attribute(attribute_hash)
      Constraint.new(attribute_hash, CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
    end

    #Add 'ensure presence of attribute' constraint
    #If this type of constraint is added to a pattern, the HTML node it targets
    #must have an attribute named "attribute_name" with the value
    #"attribute_value". attribute_hash maps attribute names to values; a nil
    #value stands for 'any value'.
    def self.add_ensure_presence_of_attribute(attribute_hash)
      Constraint.new(attribute_hash, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
    end

    #Add 'ensure absence of ancestor node' constraint
    #If this type of constraint is added to a pattern, the HTML node extracted
    #by the pattern must NOT contain a HTML ancestor node called 'node_name'
    #with the attribute set 'attributes'.
    #
    #"attributes" is an array of hashes, for example
    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
    #In the case that more values have to be checked with the same key
    #(e.g. 'class' => 'small' and 'class' => 'wide') it has to be written as
    #[{'class' => ['small','wide']}]
    #A single hash ({'font' => 'red'}) is accepted as well.
    #
    #"attributes" can be empty - in this case just the 'node_name' is checked
    def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
      Constraint.new([node_name, attributes], CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
    end

    #Add 'ensure presence of ancestor node' constraint
    #If this type of constraint is added to a pattern, the HTML node extracted
    #by the pattern MUST contain a HTML ancestor node called 'node_name'
    #with the attribute set 'attributes'.
    #
    #"attributes" is an array of hashes, for example
    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
    #In the case that more values have to be checked with the same key
    #(e.g. 'class' => 'small' and 'class' => 'wide') it has to be written as
    #[{'class' => ['small','wide']}]
    #A single hash ({'font' => 'red'}) is accepted as well.
    #
    #"attributes" can be empty - in this case just the 'node_name' is checked
    def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
      Constraint.new([node_name, attributes], CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
    end

    #Evaluate the constraint; if this function returns true, it means that the
    #constraint passed, i.e. its filter will be added to the extracted content
    #of the pattern.
    #
    #result is the extracted node the constraint is evaluated against.
    #Returns true/false; nil if @type is none of the known constraint types.
    def check(result)
      case @type
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
        #checked after evaluation, so here always return true
        true
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE
        attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
        !attribute_present(result)
      when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
        ancestor_node_present(result)
      when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
        !ancestor_node_present(result)
      end
    end

    private

    #We would not like these to be called from outside
    def initialize(target, type)
      @target = target
      @type = type
    end

    #Implementation of the ancestor node presence test.
    #Check the documentation of the add_ensure_presence_of_ancestor_node method
    #for further information on the accepted attribute formats.
    #
    #result must respond to search(xpath) and return a collection responding
    #to empty?. Returns true if a node called node_name is found that matches
    #ANY of the given attribute/value pairs (or, when no attributes were
    #given, if any node called node_name is found); false otherwise.
    def ancestor_node_present(result)
      node_name, node_attributes = @target
      pairs = normalized_attribute_pairs(node_attributes)

      #No attribute restriction: the bare node name decides
      return !result.search("//#{node_name}").empty? if pairs.empty?

      pairs.any? do |attr_name, attr_value|
        !result.search(ancestor_xpath(node_name, attr_name, attr_value)).empty?
      end
    end

    #Flatten every supported "attributes" format - nil, a hash, an array of
    #hashes, an array of [name, value] pairs, multi-valued entries such as
    #{'class' => ['small', 'wide']} - into a plain list of [name, value] pairs.
    def normalized_attribute_pairs(attributes)
      #Array(hash) would explode the hash into pairs, so wrap hashes explicitly
      entries = attributes.is_a?(Hash) ? [attributes] : Array(attributes)
      entries.flat_map do |entry|
        pairs = entry.is_a?(Hash) ? entry.to_a : [entry]
        pairs.flat_map do |name, values|
          values.nil? ? [[name, nil]] : Array(values).map { |value| [name, value] }
        end
      end
    end

    #Build the XPath used to look up node_name with the given attribute.
    #A nil value only requires the attribute to be present (same convention as
    #in attribute_present).
    #NOTE(review): names and values are interpolated unescaped, so a value
    #containing a single quote produces a malformed expression.
    def ancestor_xpath(node_name, attr_name, attr_value)
      return "//#{node_name}[@#{attr_name}]" if attr_value.nil?

      "//#{node_name}[@#{attr_name}='#{attr_value}']"
    end

    #Implementation of the attribute presence test.
    #Returns true only if result is a Hpricot::Elem and every attribute in
    #@target matches; returns false for anything that is not an element.
    def attribute_present(result)
      return false unless result.is_a?(Hpricot::Elem)

      #If the expected value is nil, the value of the attribute can be
      #arbitrary; in this case we just have to make sure that the attribute is
      #present, we don't care about the value
      @target.all? do |name, expected|
        actual = result.attributes[name.to_s]
        expected.nil? ? !actual.nil? : actual == expected.to_s
      end
    end
  end #end of class
end #end of module