lots of documentation and safer functionality

2025-08-20 09:09:46 -06:00 · 2017-08-22 17:50:33 -06:00 · 2017-08-22 17:50:33 -06:00 · 5d88690ded
commit 5d88690ded
parent 64093c58a2
11 changed files with 392 additions and 99 deletions
--- a/README.md
+++ b/README.md
@ -1,4 +1 @@
-You need selenium-server installed and running:
-
-java -jar /usr/share/selenium-server/selenium-server-standalone.jar -timeout 0

--- a/app/__init__.py
+++ b/app/__init__.py
@ -1 +1,12 @@
+# The __init__.py file signals to the python interpreter that the
+# app directory is a package. A package is a special module that
+# contains other modules. Each file is a module (browser, cli, etc.)
+# and the "app" package is a module that contains other modules.
+
+# The "app" module exports the stuff exposed here. We export
+# app.init() as a reference to app.config.init() and app.main
+# as a reference to app.cli.main
+
+from .config import init
+from .cli import main

--- a/app/browser.py
+++ b/app/browser.py
@ -1,13 +1,25 @@
+# browser module defines functions for creating selenium webdriver
+# objects. The way this works is selenium is a third-party library
+# that gives a common interface for interacting with web browsers,
+# i.e. chrome, firefox, internet explorer, and even a pseudo-browser.
+#
+# This library (selenium) creates a web browser process
+# (chrome will actually fire up for you to see) and gives you
+# a webdriver object interface to programmatically control the browser.
+# You can do things like click on links, extract information from the
+# page, pass control to a user... the limit is your imagination.
+
 import selenium
 import selenium.webdriver
 import logging

-settings = {}
-
-def init(settings_obj):
-    global settings
-    settings = settings_obj
+from . import config

+# This function has a parameter (driver) that passes in a value. In this case,
+# this driver variable defaults to the string 'chrome'. The code can call
+# create_webdriver() which is the same as create_webdriver('chrome') but
+# can alternatively call create_webdriver('firefox') and get different
+# functionality.
 def create_webdriver(driver='chrome'):
    if driver == 'chrome':
        return create_webdriver_chrome()
@ -19,9 +31,7 @@ def create_webdriver_firefox():

 def create_webdriver_chrome():
    opt = selenium.webdriver.chrome.options.Options()
-    opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT)
-    opt.add_argument('--kiosk-printing')
-    opt.add_argument("--focus-existing-tab-on-open=false")
+    opt.add_argument('--user-agent=' + config.obj.WEBDRIVER_USER_AGENT)
    driver = selenium.webdriver.Chrome(chrome_options = opt)
    return driver

--- a/app/cli.py
+++ b/app/cli.py
@ -1,4 +1,8 @@
 #!/usr/bin/env python
+# The command-line interface module creates an interface for
+# interacting with the python program (wikicrawl). This is an implementation
+# of the baker demo shown previously. The user can type in commands to
+# make the program do things.

 import baker
 import logging
@ -6,18 +10,13 @@ import readline # Needed for command history <up> and <down> arrows to work
 import sys

 from . import model
+from . import config

 # Problem pages:
 # Decision (from politics)
 # Malaysia (goes inside parenthesis)

 commander = baker.Baker()
-settings = {}
-
-def init(settings_obj):
-    global settings
-    settings = settings_obj
-    model.init(settings_obj)

 def main():
    user_interface = InteractiveInterface()
@ -31,9 +30,17 @@ def main():
 class InteractiveInterface:
    def __init__(self):
        self.model = model.Model()
-        x = self.model.webdriver # Request the browser open immediately

    def run(self, args, main=True):
+        """
+        Runs the command-line interface for a single command.
+
+        If called by InteractiveInterface.run(sys.argv), this method
+        will execute the commands and arguments specified on command
+        line when running this program. Alternatively, the code could
+        pass in a different set of arguments to specify what to do.
+        See start_command_loop() for more information.
+        """
        try:
            commander.run(argv=args, main=True, help_on_error=True,
                          instance=self)
@ -49,20 +56,103 @@ class InteractiveInterface:
    def start_command_loop(self):
        """
        Repeatedly asks the user what command to run until they exit.
+
+        This method calls InteractiveInterface.run(args) a little bit
+        differently. Instead of passing the arguments from the command-line
+        that were passed in when invoking the python wikicrawl app,
+        this asks the user for a line of textual input and passes
+        those strings to run() as the arguments. This way, the user can
+        access an interactive shell and repeatedly issue different
+        commands while the application is running.
        """
        commander.usage()
+        self.model.open_browser()
        while True:
            print('$ ', end = '') # Display to the user a command prompt
+                                  # The dollar-sign is a common indication
+                                  # of a shell that communicates to the user
+                                  # that we are waiting for their textual
+                                  # input. The end = '' indicates to python
+                                  # to NOT drop to a newline after printing
+                                  # in the terminal. Instead, let the user
+                                  # type their command on the same line as
+                                  # our printed '$ '.
            try:
                inp = input()
            except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
                break
-            args = ['', ] + inp.split()
+            # Note on arguments (mg):
+            # Whenever a program is run in windows or *nix, the operating
+            # system passes in the command string that was used to invoke
+            # the program. You can append data in that command to configure
+            # switches or values going into the program on the fly. For
+            # example, you can invoke this wikicrawl app in more than one
+            # way. You can of course run "python launcher.py" to run the
+            # software but you can also pass in an argument. You can
+            # alternatively run "python launcher.py <argument> <argument>..."
+            # and the operating system will provide the <argument> values into
+            # the process that is running.
+            #
+            # In a real world use case, many commands provide switches to
+            # adjust what the program does. For example,
+            #
+            # The command:
+            #     find music -iname "*justin*bieber*"
+            # runs the "find" program and asks to find all the filenames that match the
+            # pattern *justin*bieber* in the "music" directory.
+            # (music, -iname, "*justin*bieber*") are argument parameters
+            # that are passed into the program. The program is coded to
+            # parse and interpret these values and execute differently based
+            # on the values passed in. This is one way to pass in information
+            # into a running program. Some other ways are to read from a file
+            # (such as how we read from settings.py to load the runtime
+            # configuration), from something called environment variables
+            # (won't get into but another set of values provided to programs
+            # from the operating system), or they can be hard-coded into
+            # the application.
+            #
+            # Side note: arguments are not unique to python (almost all
+            # programming languages implement arguments). The functionality
+            # is defined by the application (some programs require arguments,
+            # some make them optional, and the syntax for sending in argument
+            # parameters differs and is defined by the individual programs).
+            # Lastly, the first argument sent in is the script name or
+            # filename of the script. In our case, the first argument is
+            # the string "launcher.py". If the user invoked the command
+            # as C:\Users\mguest\launcher.py then the first argument
+            # would be C:\Users\mguest\launcher.py.

-            if "--help" in args:
-                args.remove("--help")
+            # What this method (start_command_loop()) does is provide a
+            # REPL which is a
+            # read-eval-print-loop. It repeatedly asks the user for an
+            # input (read), evaluates that input into an action (evaluate),
+            # gives the user some feedback (print), and starts the process
+            # over again (loop). When you call "python", you are given a python
+            # process that gives you a REPL interactive shell. The way
+            # this wikicrawl app is implemented gives the user a REPL
+            # that has commands to interact with wikipedia pages.
+            args = [sys.argv[0], ] + inp.split()
+
+            # The user can at any point in the command pass the argument
+            # switch "--help". If doing this, the command line interface
+            # will instead print out the inline documentation associated
+            # with this command and quit after doing so. For example,
+            # the user can type "python launcher.py do_random_page --help"
+            # and the program will spit out the generated documentation
+            # for the do_random_page command and run nothing. In our case,
+            # this documentation is created by the baker library and will
+            # print out the docstring associated with the method. Try it
+            # out in your shell (cmd.exe or powershell.exe) by invoking
+            #     python launcher.py do_random_page --help
+            # You will see the program print the docstring of the
+            # do_random_page method defined below.
+
+            if '--help' in args:
+                args.remove('--help')
                try:
+                    print('command usage:')
                    commander.usage(args[1])
+                    return
                except Exception as ex:
                    print(type(ex), ex)
                continue
@ -71,13 +161,21 @@ class InteractiveInterface:

    @commander.command
    def do_random_page(self):
+        """
+        Instructs the wikicrawl application to play the game on a random
+        article.
+        """
        self.model.do_random_page()

    @commander.command
    def do_n_pages(self, n):
+        """
+        Plays the wikicrawl game <n>-times.
+        """
        try:
            n = int(n)
        except ValueError as ex:
+            logging.warn('failed to process "%s" as a parameter' % n)
            return False
        for i in range(n):
            self.model.do_random_page()
--- a/app/config.py
+++ b/app/config.py
@ -0,0 +1,13 @@
+# config module defines a place to store the external configuration/settings
+# and is used to provide an interface to the runtime configuration for the
+# program.
+
+from . import log
+
+obj = {}
+
+def init(settings_obj):
+    global obj
+    obj = settings_obj
+    log.init_logging()
+
--- a/app/dal.py
+++ b/app/dal.py
@ -2,11 +2,7 @@ import sqlite3
 import pycurl
 import os

-settings = {}
-
-def init(settings_obj):
-    global settings
-    settings = settings_obj
+from . import config

 class DataLayer:
    def __init__(self):
--- a/app/log.py
+++ b/app/log.py
@ -1,11 +1,7 @@
 import logging

-settings = {}
-
-def init(settings_obj):
-    global settings
-    settings = settings_obj
-    init_logging()
+from . import config

 def init_logging():
-    logging.basicConfig(level=settings.LOG_LEVEL)
+    logging.basicConfig(level=config.obj.LOG_LEVEL)
+
--- a/app/model.py
+++ b/app/model.py
@ -1,43 +1,96 @@
+# model module contains the business logic of the program. Notice
+# the command-line interface contains no business logic and only
+# has functionality to call on the model. Similarly, the page
+# objects define no specific functionality for doing the wiki crawl
+# but only provide general utility methods that are called upon
+# to implement the wiki crawl. This is a separation of concerns
+# and keeps the logic organized and separated.
+
 import logging
 import os
 import time

 from . import browser
-from . import log
+from . import config
 from . import dal
+from . import log
 from . import pages

-settings = {}
-
-def init(settings_obj):
-    global settings
-    settings = settings_obj
-
-    browser.init(settings_obj)
-    dal.init(settings_obj)
-    pages.init(settings_obj)
-    log.init(settings_obj)
-
 class Model:
    def __init__(self):
        self._webdriver = None

    @property
    def webdriver(self):
+        # The way this works is when an object instance of class/type Model
+        # is called with x.webdriver, Model runs webdriver(). In our case,
+        # the webdriver() method checks if a private variable self._webdriver
+        # is set and, if it isn't, asks for a new selenium object. The result
+        # is that this will on-demand create a browser. If one exists, it will
+        # use the one that exists and if one doesn't exist, it will create
+        # one and use that. External code can rely on self.webdriver
+        # always existing with or without knowing if it exists because if it
+        # hasn't been created yet then it will be created on-the-fly.
        if not self._webdriver:
-            self._webdriver = browser.create_webdriver(settings.WEBDRIVER_BROWSER)
+            self._webdriver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER)
            page_api = pages.LandingPage(self.webdriver)
            page_api.goto_landing_page()
        return self._webdriver

-    def do_random_page(self):
-        # Landing page (select language)
-        page_api = pages.LandingPage(self.webdriver)
-        page_api.goto_landing_page()
-        page_api.select_language(settings.PAGE_LANGUAGE)
+    def open_browser(self):
+        x = self.webdriver # Request the browser open immediately.
+                           # Without this, the webdriver will
+                           # be created on-demand (as defined in
+                           # the Model class). This means that the
+                           # web browser will not open until
+                           # a command is typed in. But because we
+                           # request the webdriver right here, Model
+                           # creates it and then it is re-used later
+                           # in the application.

-        # Main page
+    def do_random_page(self):
+        """
+        Select a random page and repeatedly click the first link until
+        we reach the article on philosophy. Sometimes, the driver encounters
+        a loop and will never reach the page and sometimes the parser
+        fails and we fail to programmatically implement what we're trying to
+        do correctly.
+        """
+        # The following 3 lines include the functionality
+        # for the Landing page (select language).
+
+        # This line creates a new object (page_api) which is an instance
+        # of type class pages.LandingPage. LandingPage is a variable
+        # containing a class definition that is located in the pages module.
+        # We pass self.webdriver as an argument into the LandingPage.__init__
+        # constructor.
+        page_api = pages.LandingPage(self.webdriver)
+
+        # This line calls the page_api object's (an instance of
+        # pages.LandingPage type) method goto_landing_page. 
+        page_api.goto_landing_page()
+
+        # Similarly, this line calls the select_language method
+        # and passes in values from our runtime configuration.
+        # In this case, we have made the language a parameter
+        # that you can pass into the program, i.e. you can run it
+        # for English or Spanish or Russian or what have you.
+        page_api.select_language(config.obj.PAGE_LANGUAGE)
+
+        # Main page: next 2 lines
+
+        # At this point, we have clicked a link and changed the page. We
+        # re-create our page interface as a new object which is of
+        # a different class and includes distinct code for working with
+        # the page. In this case, we delete page_api and re-create it
+        # as an object of type pages.MainPage. Again, we pass in
+        # self.webdriver as an object of the selenium webdriver interface.
+        # The page_api calls methods on this webdriver to make the web
+        # browser do various things like click links or extract text.
        page_api = pages.MainPage(self.webdriver)
+
+        # We call pages.MainPage.goto_random_article() to perform
+        # the action we're trying to invoke.
        page_api.goto_random_article()

        # Article page
@ -62,4 +115,3 @@ class Model:
                break
            print()

-
--- a/app/pages.py
+++ b/app/pages.py
@ -1,50 +1,109 @@
+# Pages module defines classes for interacting with wikipedia pages.
+# There are separate classes defined for each page with their own
+# defined methods for performing certain actions.
+
 import logging
 import re
 import selenium
-import selenium.webdriver
 import time

-settings = {}
-
-def init(settings_obj):
-    global settings
-    settings = settings_obj
+from . import browser
+from . import config

 def breakpoint():
-    if settings.DO_BREAKPOINTS:
+    """
+    If DO_BREAKPOINTS is switched on, this will pause program
+    execution and wait for the user to press enter to continue.
+    """
+    if config.obj.DO_BREAKPOINTS:
        input('Breakpoint here. <Enter> to continue...')

 class PageRootObject:
+    """
+    Common interface methods for working with pages. The specific
+    page classes below inherit these methods and define additional methods
+    so every page has available these methods and any additional
+    methods they define.
+
+    In here are some re-used methods to click links and highlight
+    elements in the browser.
+    """
    def __init__(self, driver=None):
+        """
+        Object constructor for initializing the instance of this
+        class with internal variables needed.
+
+        Args:
+            driver: Reference to the selenium webdriver object
+        that is used to interface with the web browser.
+        """
        if not driver:
-            self.driver = create_webdriver()
+            self.driver = browser.create_webdriver()
        else:
            self.driver = driver

    def click(self, el): 
+        """
+        Clicks a link in the browser and also highlights it to the
+        end user.
+
+        Args:
+            el: selenium element to be clicked. Typically an anchor
+        html link in the page.
+        """
        self.highlight(el, 'red')
-        time.sleep(settings.PAGE_DELAY)
+        time.sleep(config.obj.PAGE_DELAY)
+        breakpoint()
        el.click()

    def highlight(self, el, color):
+        """
+        Highlights an html element in the web browser by changing the
+        background color as well as making the text bold.
+
+        The implementation uses javascript to alter the css of the element.
+
+        Args:
+            el: selenium element to be highlighted.
+            color: background color to highlight. Input can be one of
+        'red', 'blue', or hex code such as '#ffffff'.
+        """
+        # Note: The way hex codes work is there is 1 byte (2 hex characters)
+        # for every color. #RRGGBB for (red, green, blue). This can be thought
+        # of as an integer 0-255 for red, green, and blue in base-16 hexadecimal.
        if color == 'red':
            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
        elif color == 'blue':
            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
+        else:
+            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
        self.driver.execute_script(js, el)

 class LandingPage(PageRootObject):
+    """
+    Interface for working with the wikipedia.org landing page. This page has links to
+    select a language and go to the respective wikipedia root page.
+    """
+
+    # Note: This is the LandingPage() object constructor. All it does right now is
+    # reference the parent (PageRootObject) constructor method and call it. This
+    # calls PageRootObject.__init__(driver) which makes the web driver available
+    # in the object instance.
    def __init__(self, driver=None):
        super().__init__(driver)

    def goto_landing_page(self):
-        self.driver.get(settings.PAGE_BASE_URL)
+        self.driver.get(config.obj.PAGE_BASE_URL)

    def select_language(self, language):
        link = self.driver.find_element_by_partial_link_text(language)
        self.click(link)

 class MainPage(PageRootObject):
+    """
+    Interface for a selected language root page. This has the link to go to a random article
+    and has a featured article. An example url for this is https://en.wikipedia.org.
+    """
    def __init__(self, driver=None):
        super().__init__(driver)

@ -53,7 +112,17 @@ class MainPage(PageRootObject):
        self.click(link)

 class ArticlePage(PageRootObject):
+    """
+    Interface for a wikipedia article page. Here are defined some utility methods to
+    try and click the first valid link and extract some information from the page.
+    """

+    # Here are static class-scoped variables that are needed to work with the page.
+    # These are used to locate html elements in the web browser. There are many
+    # ways to locate elements but one of the best if available is locating by id. It's
+    # not enforced but the html specification mandates that element ids are unique
+    # so if you can select by id in a semantically correct web page, you can correctly
+    # select unique elements with high confidence.
    elements = {
        'main-window-content-text-id': 'mw-content-text',
        'article-title': 'firstHeading',
@ -69,7 +138,24 @@ class ArticlePage(PageRootObject):
    def click_first_link(self):
        return self._iterate_paragraphs()

+    # Note: Here this method has its name prefixed with a single underscore.
+    # This is a convention that communicates to the developer that these methods
+    # are internal private methods. That means they are not meant to be exposed
+    # to the external interface. Python does not restrict calling these methods.
+    # You can still call ArticlePage._iterate_paragraphs() but the prefix
+    # underscore tells you that it is not intended to be exposed and may be
+    # unsafe to call. Depending on the implementation, it may not make sense
+    # to directly call this method and may result in undefined and unexpected
+    # behavior. _iterate_paragraphs is called internally from the exposed
+    # click_first_link() but is never invoked externally.
    def _iterate_paragraphs(self):
+        """
+        Iterates through paragraphs in the page and attempts to find the first
+        valid link. Sometimes the first paragraph does not have a link so this
+        needs to go through a few paragraphs and it does not make sense to
+        operate on the entire article every time when we're just looking for
+        the first link, for performance optimization.
+        """
        main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
        paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
        for p in paragraphs:
@ -89,8 +175,7 @@ class ArticlePage(PageRootObject):
                continue
            self.highlight(link, 'red')
            logging.info('selected link: %s' % link.text)
-            breakpoint()
-            link.click()
+            self.click(link)
            return True

    def _is_valid_link(self, p, el):
@ -98,45 +183,61 @@ class ArticlePage(PageRootObject):
        b = self._is_link_a_footnote(el)
        c = self._is_link_pronounciation(el)
        d = self._is_link_audio(el)
-        print(a, b, c, d)
        if not a and not b and not c and not d:
            return True
        return False

    def _is_link_in_parenthesis(self, p, el):
-        # link_text = el.text
+        """
+        Determine if a given link element is inside a set
+        of textual parenthesis.
+        """
+        # Implementation notes (mg):
+        # I've tried a few different ways to do this and it's
+        # hard to get it to work in every case. I want to avoid
+        # certain links and usually avoid links inside parenthetical
+        # notes. Some edge cases are nested parenthesis, links with
+        # non-english characters (which are displayed with a tree
+        # of elements in the html rather than a simple link). And
+        # sometimes, the link inside the parenthesis may be a valid
+        # target. I've made it so that skipped links show up as blue
+        # and determined-valid links highlight as red.
        link_text = el.get_attribute('outerHTML')
        p_text = p.get_attribute('innerHTML')

-        regex_str = '\(.*?\)'
+        regex_str = '\(.*?\)' # Regular expression to extract the
+                              # text inside (not nested) parenthesis
        regex = re.compile(regex_str, flags=re.UNICODE)
        match = regex.search(p_text)
        if not match:
+            # There are no parenthesis at all in this paragraph.
            return False

        while match is not None:
+            # There may be multiple parenthesis (or nested). This
+            # iterates through them and checks if the links html
+            # is present inside these parenthesis.
+            #
+            # Care must be taken with regular expressions as they are
+            # user/developer unfriendly, hard-to-read, and unforgiving.
+            # For example, when you try to match (<anything>) inside of
+            # "(some words) some more words (even more words)", you
+            # can match unpaired parentheses and the computer will return
+            # unexpected results. The code is quite dumb and does exactly
+            # what you tell it to.
            match_text = match.group(0)
            match_idx = match.end(0)
-            print(link_text)
-            print(match_text)
            if link_text in match_text:
                return True
-
            match = regex.search(p_text, match_idx+1)

-        # Is the link inside parenthesis?
-        # regex_str = '\([^()]*<a.*?>%s</a>[^())]*\)' % re.escape(link_text)
-        # regex_str = '\(.*<a.*?>%s.*\)' % link_text
-        # print(regex_str)
-        # regex = re.compile(regex_str, flags=re.UNICODE)
-        # match = re.search(regex_str, p_text)
-        # if match: # Pattern is found in the text
-        #     print(match.group(0))
-        #     return True
-        # else:
-        #     return False
+        return False

    def _is_link_a_footnote(self, el):
+        # Some links are anchors to footnotes, e.g. [1] that points to a source
+        # at the bottom of the page. These aren't valid links for our purpose
+        # so this method looks for that and determines if the reference element
+        # appears to be a link to a footnote.
        href = el.get_attribute('href')
        if '#cite_note' in href:
            return True
@ -145,12 +246,18 @@ class ArticlePage(PageRootObject):
        return False

    def _is_link_pronounciation(self, el):
+        # Some links point to the wikipedia IPA (international phonetic
+        # alphabet) pronunciation help page. We don't want to click these
+        # links so we scan for and ignore them.
        href = el.get_attribute('href')
        if '/wiki/Help:IPA' in href:
            return True
        return False

    def _is_link_audio(self, el):
+        # Some links are audio playback pronunciations. We look for these
+        # by checking for the file extension .ogg (an audio file format,
+        # ogg-vorbis) and ignoring links if they are of that type.
        href = el.get_attribute('href')
        if '.ogg' in href:
            return True
--- a/launcher.py
+++ b/launcher.py
@ -1,6 +1,9 @@
-import app.cli
+import app
 import settings

-app.cli.init(settings.Settings)
-app.cli.main()
+# Inject the settings.DefaultSettings object into the
+# app and start running the program.
+app.init(settings.DefaultSettings)
+app.main()
 input('<enter> to exit')
+
--- a/settings.py
+++ b/settings.py
@ -1,6 +1,16 @@
+# Application run-time configuration/settings. This contains variables
+# that control how the program works but are kept separate from the
+# program. It makes sense for certain parameters to be adjustable but
+# not hard-coded into the application. For example, some users may want
+# to run this program in English while others may want to run in Spanish.
+# The way this works is we specify those variables external from the
+# application (here) and pass them into the application (app.config module).
+# The application then references app.config.obj to access the variables
+# passed in from here.
+
 import logging

-class Settings:
+class DefaultSettings:
    # Application Parameters
    LOG_LEVEL = logging.INFO
    DO_BREAKPOINTS = True