266 lines
11 KiB
Python
266 lines
11 KiB
Python
# The pages module defines classes for interacting with Wikipedia pages.
# Each kind of page has its own class, and each class defines the methods
# for the actions that can be performed on that page.
|
|
|
|
import logging
import re
import time

import selenium
from selenium.webdriver.common.by import By

from . import browser
from . import config
|
|
|
|
def breakpoint():
    """
    Optionally pause program execution until the user presses <Enter>.

    Nothing happens unless config.obj.DO_BREAKPOINTS is truthy. When it
    is, the call blocks on input() until the user presses enter.

    Note: inside this module the name shadows Python's built-in
    breakpoint() function.
    """
    # Guard clause: breakpoints are switched off, so carry on immediately.
    if not config.obj.DO_BREAKPOINTS:
        return
    input('Breakpoint here. <Enter> to continue...')
|
|
|
|
class PageRootObject:
    """
    Common interface methods for working with pages.

    The specific page classes inherit from this class, so every page has
    these methods available in addition to any methods it defines itself.

    The shared methods here click elements and highlight elements in the
    browser.
    """

    # Color names accepted by highlight(), mapped to the hex code that is
    # actually applied. Any value not listed here is used as the css color
    # unchanged.
    #
    # Note: The way hex codes work is there is 1 byte (2 hex characters)
    # for every color: #RRGGBB for (red, green, blue). Each pair can be
    # thought of as an integer 0-255 written in base-16 hexadecimal.
    _HIGHLIGHT_COLORS = {
        'red': '#ff9292',
        'blue': '#9292ff',
    }

    # The css text is handed to the script as arguments[1] instead of being
    # pasted into the javascript source, so the color value never has to be
    # escaped as a javascript string.
    _HIGHLIGHT_SCRIPT = 'arguments[0].setAttribute("style", arguments[1])'

    def __init__(self, driver=None):
        """
        Object constructor for initializing the instance of this
        class with the internal variables needed.

        Args:
            driver: Reference to the selenium webdriver object
                that is used to interface with the web browser. When
                omitted (or falsy), a new one is obtained from
                browser.create_webdriver().
        """
        self.driver = driver if driver else browser.create_webdriver()

    def click(self, el):
        """
        Clicks an element in the browser and also highlights it to the
        end user.

        The element is highlighted in red first, then the method sleeps
        for config.obj.PAGE_DELAY seconds and calls breakpoint() before
        the click is performed, so the user can see what is about to be
        clicked.

        Args:
            el: selenium element to be clicked. Typically an anchor
                html link in the page.
        """
        self.highlight(el, 'red')
        time.sleep(config.obj.PAGE_DELAY)
        breakpoint()
        el.click()

    def highlight(self, el, color):
        """
        Highlights an html element in the web browser by changing the
        background color as well as making the text bold.

        The implementation uses javascript to set the element's inline
        "style" attribute, which replaces any inline style the element
        already had.

        Args:
            el: selenium element to be highlighted.
            color: background color to highlight. Input can be one of
                'red', 'blue', or a hex code such as '#ffffff'.
        """
        background = self._HIGHLIGHT_COLORS.get(color, color)
        style = 'background: %s;font-weight:bold;color:#000' % background
        self.driver.execute_script(self._HIGHLIGHT_SCRIPT, el, style)
|
|
|
|
class LandingPage(PageRootObject):
    """
    Interface for working with the wikipedia.org landing page. This page
    has links to select a language and go to the respective wikipedia
    root page.
    """

    # Note: This is the LandingPage() object constructor. All it does right
    # now is call the parent (PageRootObject) constructor, which makes the
    # web driver available on the object instance as self.driver.
    def __init__(self, driver=None):
        super().__init__(driver)

    def goto_landing_page(self):
        """
        Navigates the browser to the url in config.obj.PAGE_BASE_URL.
        """
        self.driver.get(config.obj.PAGE_BASE_URL)

    def select_language(self, language):
        """
        Finds the first link whose text contains the given language
        and clicks it.

        Args:
            language: text (or part of the text) of the language link
                to click.
        """
        # find_element(By..., value) is used instead of the
        # find_element_by_* helpers because those helpers were removed in
        # selenium 4.3, while this form works in selenium 3 and 4 alike.
        link = self.driver.find_element(By.PARTIAL_LINK_TEXT, language)
        self.click(link)
|
|
|
|
class MainPage(PageRootObject):
    """
    Interface for a selected language root page. This has the link to go
    to a random article and has a featured article. An example url for
    this is https://en.wikipedia.org.
    """

    def __init__(self, driver=None):
        super().__init__(driver)

    def goto_random_article(self):
        """
        Clicks the link inside the list item whose id contains
        "n-randompage".
        """
        # find_element(By..., value) is used instead of the
        # find_element_by_* helpers because those helpers were removed in
        # selenium 4.3, while this form works in selenium 3 and 4 alike.
        link = self.driver.find_element(
            By.XPATH, '//li[contains(@id, "n-randompage")]/a')
        self.click(link)
|
|
|
|
class ArticlePage(PageRootObject):
    """
    Interface for a wikipedia article page. Here are defined some utility
    methods to try and click the first valid link and extract some
    information from the page.
    """

    # Here are static class-scoped variables that are needed to work with
    # the page. These are used to locate html elements in the web browser.
    # There are many ways to locate elements but one of the best, if
    # available, is locating by id. It's not enforced, but the html
    # specification mandates that element ids are unique, so if you can
    # select by id in a semantically correct web page, you can select
    # unique elements with high confidence.
    elements = {
        'main-window-content-text-id': 'mw-content-text',
        'article-title': 'firstHeading',
    }

    # Tokenizer used for parenthesis matching. Each match is either a whole
    # html tag (quoted attribute values are consumed as part of the tag, so
    # a '>' or a parenthesis inside an href/title is not mistaken for text)
    # or a single '(' or ')' that appears in the text between tags.
    _TAG_OR_PAREN = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^'">])*>|[()]''')

    def __init__(self, driver=None):
        super().__init__(driver)

    def get_title(self):
        """
        Returns the text of the article heading element.
        """
        heading = self.driver.find_element(
            By.ID, self.elements['article-title'])
        return heading.text

    def click_first_link(self):
        """
        Clicks the first valid link found in the article's paragraphs.

        Returns:
            True if a link was found and clicked, otherwise False.
        """
        return self._iterate_paragraphs()

    # Note: The methods below have their names prefixed with a single
    # underscore. This is a convention that communicates to the developer
    # that these are internal private methods, not meant to be part of the
    # external interface. Python does not restrict calling them, but the
    # underscore tells you that they are not intended to be exposed and
    # that calling them directly may give unexpected results.
    # _iterate_paragraphs is called internally from the exposed
    # click_first_link() but is never invoked externally.
    def _iterate_paragraphs(self):
        """
        Iterates through paragraphs in the page and attempts to find the
        first valid link. Sometimes the first paragraph does not have a
        link, so this needs to go through a few paragraphs. It stops at
        the first paragraph in which a link was clicked, so the rest of
        the article is not processed.

        Returns:
            True if a link was clicked, otherwise False.
        """
        main_window = self.driver.find_element(
            By.ID, self.elements['main-window-content-text-id'])
        paragraphs = main_window.find_elements(
            By.XPATH,
            './div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
        # any() stops at the first paragraph that reports a click.
        return any(self._parse_paragraph(p) for p in paragraphs)

    def _parse_paragraph(self, p):
        """
        Clicks the first valid link inside the given paragraph element.

        Links that are rejected by _is_valid_link() are highlighted in
        blue and skipped.

        Args:
            p: selenium element of the paragraph to search.

        Returns:
            True if a link was clicked, False if the paragraph has no
            links or none of them is valid.
        """
        for link in p.find_elements(By.XPATH, './/a'):
            text = link.text
            logging.debug('processing link: %s', text)
            if not self._is_valid_link(p, link):
                logging.debug('skipping link inside parenthesis: %s', text)
                self.highlight(link, 'blue')
                continue
            logging.info('selected link: %s', text)
            # click() highlights the link in red before clicking it.
            self.click(link)
            return True
        return False

    def _is_valid_link(self, p, el):
        """
        Decides whether a link is an acceptable target to click.

        A link is rejected when it has no href, points to a footnote, to
        the IPA pronunciation help page, or to an audio file, or when it
        sits inside textual parenthesis.

        Args:
            p: selenium element of the paragraph containing the link.
            el: selenium element of the link.

        Returns:
            True if the link may be clicked, otherwise False.
        """
        # An anchor without an href cannot navigate anywhere, and the
        # href based checks below have nothing to inspect.
        if not self._href(el):
            return False
        # The checks short-circuit; the parenthesis check needs the most
        # data from the browser, so it runs last.
        return not (
            self._is_link_a_footnote(el)
            or self._is_link_pronounciation(el)
            or self._is_link_audio(el)
            or self._is_link_in_parenthesis(p, el)
        )

    @staticmethod
    def _href(el):
        """
        Returns the element's href attribute, or '' when it has none.
        """
        return el.get_attribute('href') or ''

    @classmethod
    def _balanced_parenthesis_spans(cls, html):
        """
        Finds every matched pair of textual parenthesis in a piece of html.

        Parenthesis inside tags (for example in an href or title
        attribute) are ignored, and nested pairs are matched up correctly.
        An opening or closing parenthesis without a partner is ignored.

        Args:
            html: html source text to scan.

        Returns:
            A list of (open_index, close_index) tuples, the positions in
            html of each matched '(' and ')'.
        """
        spans = []
        open_positions = []
        for token in cls._TAG_OR_PAREN.finditer(html):
            symbol = token.group(0)
            if symbol == '(':
                open_positions.append(token.start())
            elif symbol == ')' and open_positions:
                spans.append((open_positions.pop(), token.start()))
        return spans

    def _is_link_in_parenthesis(self, p, el):
        """
        Determine if a given link element is inside a set
        of textual parenthesis.

        Args:
            p: selenium element of the paragraph containing the link.
            el: selenium element of the link.

        Returns:
            True if any occurrence of the link's html in the paragraph
            lies completely between a matched '(' and ')', else False.
        """
        # Implementation notes:
        # It is hard to get this to work in every case. The goal is to
        # avoid links inside parenthetical notes. Some edge cases are
        # nested parenthesis and links with non-english characters (which
        # are displayed with a tree of elements in the html rather than a
        # simple link). And sometimes the link inside the parenthesis may
        # be a valid target. Skipped links show up highlighted as blue and
        # determined-valid links highlight as red.
        #
        # The link is located by searching for its outerHTML inside the
        # paragraph's innerHTML, and its position is compared against the
        # matched parenthesis pairs found in the paragraph text.
        link_html = el.get_attribute('outerHTML')
        paragraph_html = p.get_attribute('innerHTML')
        if not link_html or not paragraph_html:
            return False

        spans = self._balanced_parenthesis_spans(paragraph_html)
        if not spans:
            # There are no matched parenthesis at all in this paragraph.
            return False

        # The same link html may occur more than once in the paragraph;
        # every occurrence is checked.
        start = paragraph_html.find(link_html)
        while start != -1:
            end = start + len(link_html)
            if any(opened < start and end <= closed
                   for opened, closed in spans):
                return True
            start = paragraph_html.find(link_html, start + 1)

        return False

    def _is_link_a_footnote(self, el):
        """
        Returns True if the link's href points to a footnote.
        """
        # Some links are anchors to footnotes, e.g. [1] that points to a
        # source at the bottom of the page. These aren't valid links for
        # our purpose, so this looks for the footnote markers in the href.
        href = self._href(el)
        return '#cite_note' in href or '#cnote' in href

    def _is_link_pronounciation(self, el):
        """
        Returns True if the link's href points to the IPA help page.
        """
        # Some links point to the wikipedia IPA (international phonetic
        # alphabet) pronunciation help page. We don't want to click these
        # links so we scan for and ignore them.
        return '/wiki/Help:IPA' in self._href(el)

    def _is_link_audio(self, el):
        """
        Returns True if the link's href contains '.ogg'.
        """
        # Some links are audio playback pronunciations. We look for these
        # by checking for the file extension .ogg (an audio file format,
        # ogg-vorbis) and ignoring links if they are of that type.
        return '.ogg' in self._href(el)
|
|
|