# wikicrawl/app/pages.py

# The pages module defines classes for interacting with Wikipedia pages.
# Each kind of page has its own class, and each class defines the methods
# for the actions that can be performed on that page.

import logging
import re
import time

import selenium
from selenium.webdriver.common.by import By

from . import browser
from . import config

def breakpoint():
    """
    Optionally pause the program until the user presses enter.

    The pause only happens when config.obj.DO_BREAKPOINTS is truthy;
    otherwise this returns immediately without prompting.

    Note: this function has the same name as Python's builtin
    breakpoint(), so inside this module the builtin is shadowed.
    """
    if not config.obj.DO_BREAKPOINTS:
        return
    # Blocks on stdin; whatever the user types is discarded.
    input('Breakpoint here. <Enter> to continue...')

class PageRootObject:
    """
    Base class for the page classes defined in this module.

    It holds the selenium webdriver and provides the helpers that are
    re-used by the specific pages: clicking an element and highlighting
    an element in the browser. The specific page classes inherit these
    methods and add their own on top.
    """
    def __init__(self, driver=None):
        """
        Store the webdriver that this page object talks to.

        Args:
            driver: Reference to the selenium webdriver object that is
                used to interface with the web browser. When nothing
                (or any falsy value) is given, a new webdriver is
                created through browser.create_webdriver().
        """
        self.driver = driver if driver else browser.create_webdriver()

    def click(self, el):
        """
        Highlight an element for the end user and then click it.

        The element is highlighted in red, the program waits for
        config.obj.PAGE_DELAY seconds, an optional breakpoint is
        offered, and only then is the click sent to the browser.

        Args:
            el: selenium element to be clicked. Typically an anchor
                html link in the page.
        """
        self.highlight(el, 'red')
        # Give the user time to see which element was picked.
        time.sleep(config.obj.PAGE_DELAY)
        breakpoint()
        el.click()

    def highlight(self, el, color):
        """
        Highlight an html element in the web browser.

        The element's style attribute is replaced (through javascript)
        with one that sets the background color, makes the text bold
        and makes the text black.

        Args:
            el: selenium element to be highlighted.
            color: background color to highlight with. 'red' and 'blue'
                are translated to fixed light shades; any other value
                is placed into the css as-is, e.g. a hex code such as
                '#ffffff'.
        """
        # Note: A hex color code is #RRGGBB, one byte (2 hex characters)
        # each for red, green and blue. Each byte can be thought of as
        # an integer 0-255 written in base-16 hexadecimal.
        if color == 'red':
            background = '#ff9292'
        elif color == 'blue':
            background = '#9292ff'
        else:
            background = color
        script = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % background
        self.driver.execute_script(script, el)

class LandingPage(PageRootObject):
    """
    Interface for working with the wikipedia.org landing page. This page has links to
    select a language and go to the respective wikipedia root page.
    """

    # Note: This is the LandingPage() object constructor. All it does right now is
    # reference the parent (PageRootObject) constructor method and call it. This
    # calls PageRootObject.__init__(driver) which makes the web driver available
    # in the object instance.
    def __init__(self, driver=None):
        super().__init__(driver)

    def goto_landing_page(self):
        """
        Navigate the browser to the url stored in config.obj.PAGE_BASE_URL.
        """
        self.driver.get(config.obj.PAGE_BASE_URL)

    def select_language(self, language):
        """
        Click the first link whose visible text contains the given language.

        Args:
            language: Text (or part of the text) of the language link
                to click.
        """
        # find_element(By..., ...) is used instead of the
        # find_element_by_partial_link_text() helper because the
        # find_element_by_* helpers were removed in Selenium 4.3, while
        # this form is available in older and newer releases alike.
        link = self.driver.find_element(By.PARTIAL_LINK_TEXT, language)
        self.click(link)

class MainPage(PageRootObject):
    """
    Interface for a selected language root page. This has the link to go to a random article
    and has a featured article. An example url for this is https://en.wikipedia.org.
    """
    def __init__(self, driver=None):
        super().__init__(driver)

    def goto_random_article(self):
        """
        Click the anchor inside the list item whose id contains
        "n-randompage".
        """
        # find_element(By.XPATH, ...) is used instead of
        # find_element_by_xpath() because the find_element_by_* helpers
        # were removed in Selenium 4.3, while this form is available in
        # older and newer releases alike.
        link = self.driver.find_element(By.XPATH, '//li[contains(@id, "n-randompage")]/a')
        self.click(link)

class ArticlePage(PageRootObject):
    """
    Interface for a wikipedia article page. Here are defined some utility methods to
    try and click the first valid link and extract some information from the page.
    """

    # Here are static class-scoped variables that are needed to work with the page.
    # These are used to locate html elements in the web browser. There are many
    # ways to locate elements but one of the best if available is locating by id. It's
    # not enforced but the html specification mandates that element id's are unique
    # so if you can select by id in a semantically correct web page, you can correctly
    # select unique elements with high confidence.
    elements = {
        'main-window-content-text-id': 'mw-content-text',
        'article-title': 'firstHeading',
    }

    # Regular expression to extract the text inside (not nested)
    # parenthesis. It is a raw string so that the backslashes reach the
    # regex engine untouched, and it is compiled once here instead of on
    # every link that gets checked.
    _PARENTHESIS_REGEX = re.compile(r'\(.*?\)', flags=re.UNICODE)

    def __init__(self, driver=None):
        super().__init__(driver)

    def get_title(self):
        """
        Return the text of the article heading element.
        """
        heading = self.driver.find_element(By.ID, ArticlePage.elements['article-title'])
        return heading.text

    def click_first_link(self):
        """
        Click the first valid link found in the article paragraphs.

        Returns:
            True if a link was clicked, False if no valid link was found.
        """
        return self._iterate_paragraphs()

    # Note: Here this method has its name prepended with a single underscore.
    # This is a convention that communicates to the developer that these methods
    # are internal private methods. That means they are not meant to be exposed
    # to the external interface. Python does not restrict calling these methods.
    # You can still call ArticlePage._iterate_paragraphs() but the prefix
    # underscore tells you that it is not intended to be exposed and may be
    # unsafe to call. Depending on the implementation, it may not make sense
    # to directly call this method and may result in undefined and unexpected
    # behavior. _iterate_paragraphs is called internally from the exposed
    # click_first_link() but is never invoked externally.
    def _iterate_paragraphs(self):
        """
        Iterates through paragraphs in the page and attempts to find the first
        valid link. Sometimes the first paragraph does not have a link so this
        needs to go through a few paragraphs and it does not make sense to
        operate on the entire article every time when we're just looking for
        the first link, for performance optimization.

        Returns:
            True as soon as a paragraph yields a clicked link, False if
            no paragraph does.
        """
        main_window = self.driver.find_element(By.ID, ArticlePage.elements['main-window-content-text-id'])
        paragraphs = main_window.find_elements(By.XPATH, './div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
        # any() stops at the first paragraph that reports a click, so
        # later paragraphs are not touched after the link was clicked.
        return any(self._parse_paragraph(p) for p in paragraphs)

    def _parse_paragraph(self, p):
        """
        Click the first valid link inside one paragraph.

        Every link that is skipped is highlighted in blue. The link
        that gets selected is highlighted in red by click().

        Args:
            p: selenium element of the paragraph to scan.

        Returns:
            True if a link was clicked, False if the paragraph has no
            valid link (including when it has no links at all).
        """
        for link in p.find_elements(By.XPATH, './/a'):
            logging.debug('processing link: %s', link.text)
            if not self._is_valid_link(p, link):
                # The link may be rejected for several reasons (see
                # _is_valid_link), not only for being in parenthesis.
                logging.debug('skipping invalid link: %s', link.text)
                self.highlight(link, 'blue')
                continue
            logging.info('selected link: %s', link.text)
            self.click(link)
            return True
        return False

    def _is_valid_link(self, p, el):
        """
        Determine if a link is an acceptable target to click.

        A link is valid when it is not a footnote, not a pronounciation
        help link, not an audio link and not inside parenthesis.

        Args:
            p: selenium element of the paragraph that contains the link.
            el: selenium element of the link.
        """
        # The href checks are cheap, so they run first; the parenthesis
        # check is only reached when none of them rejects the link.
        return not (
            self._is_link_a_footnote(el)
            or self._is_link_pronounciation(el)
            or self._is_link_audio(el)
            or self._is_link_in_parenthesis(p, el)
        )

    @staticmethod
    def _get_href(el):
        """
        Return the href attribute of an element, or an empty string
        when the attribute lookup yields nothing, so callers can do
        substring checks without tripping over None.
        """
        return el.get_attribute('href') or ''

    def _is_link_in_parenthesis(self, p, el):
        """
        Determine if a given link element is inside a set
        of textual parenthesis.

        Args:
            p: selenium element of the paragraph that contains the link.
            el: selenium element of the link.

        Returns:
            True if the link's html appears inside one of the
            parenthesis groups found in the paragraph's html.
        """
        # Implementation notes (mg):
        # I've tried a few different ways to do this and it's
        # hard to get it to work in every case. I want to avoid
        # certain links and usually avoid links inside parenthetical
        # notes. Some edge cases are nested parenthesis, links with
        # non-english characters (which are displayed with a tree
        # of elements in the html rather than a simple link). And
        # sometimes, the link inside the parenthesis may be a valid
        # target. I've made it so that skipped links show up as blue
        # and determined-valid links highlight as red.
        link_html = el.get_attribute('outerHTML')
        paragraph_html = p.get_attribute('innerHTML')
        if not link_html or not paragraph_html:
            # Nothing to compare. An empty link string would also be
            # "contained" in every group, so it must not reach the
            # substring test below.
            return False

        # There may be multiple parenthesis (or nested). This
        # iterates through them and checks if the link's html
        # is present inside these parenthesis.
        #
        # finditer() continues each search exactly where the previous
        # match ended, so a group that starts right after another one,
        # as in "(first)(second)", is examined as well.
        #
        # Care must be taken with regular expressions as they are
        # user/developer unfriendly, hard-to-read, and unforgiving.
        # For example, what happens when you try to match (<anything>)
        # inside of (some words) some more words (even more words), you
        # can match unpaired parenthesis and the computer will return
        # unexpected results. The code is quite dumb and does exactly
        # what you tell it to.
        return any(
            link_html in match.group(0)
            for match in self._PARENTHESIS_REGEX.finditer(paragraph_html)
        )

    def _is_link_a_footnote(self, el):
        # Some links are anchors to footnotes, e.g. [1] that points to a source
        # at the bottom of the page. These aren't valid links for our purpose
        # so this method looks for that and determines if the reference element
        # appears to be a link to a footnote.
        href = self._get_href(el)
        return '#cite_note' in href or '#cnote' in href

    def _is_link_pronounciation(self, el):
        # Some links point to the wikipedia IPA (international phonetic
        # alphabet) pronunciation help page. We don't want to click these
        # links so we scan for and ignore them.
        return '/wiki/Help:IPA' in self._get_href(el)

    def _is_link_audio(self, el):
        # Some links are audio playback pronunciations. We look for these
        # by checking for the file extension .ogg (an audio file format,
        # ogg-vorbis) and ignoring links if they are of that type.
        return '.ogg' in self._get_href(el)