From 5d88690ded387c496f1d0f8634c4f90543ea9e0d Mon Sep 17 00:00:00 2001 From: Mathew Guest Date: Tue, 22 Aug 2017 17:50:33 -0600 Subject: [PATCH] lots of documentation and safer functionality --- README.md | 3 - app/__init__.py | 11 ++++ app/browser.py | 26 +++++--- app/cli.py | 142 +++++++++++++++++++++++++++++++++------- app/config.py | 13 ++++ app/dal.py | 6 +- app/log.py | 10 +-- app/model.py | 92 ++++++++++++++++++++------ app/pages.py | 167 +++++++++++++++++++++++++++++++++++++++--------- launcher.py | 9 ++- settings.py | 12 +++- 11 files changed, 392 insertions(+), 99 deletions(-) create mode 100644 app/config.py diff --git a/README.md b/README.md index feef72c..8b13789 100644 --- a/README.md +++ b/README.md @@ -1,4 +1 @@ -You need selenium-server installed and running: - -java -jar /usr/share/selenium-server/selenium-server-standalone.jar -timeout 0 diff --git a/app/__init__.py b/app/__init__.py index 8b13789..ca1a8a3 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1 +1,12 @@ +# The __init__.py file signals to the python interpreter that the +# app directory is a package. A package is a special module that +# contains other modules. Each file is a module (browser, cli, etc.) +# and the "app" package is a module that contains other modules. + +# The "app" module exports the stuff exposed here. We export +# app.init() as a reference to app.config.init() and app.main +# as a reference to app.cli.main + +from .config import init +from .cli import main diff --git a/app/browser.py b/app/browser.py index 86a5ce4..59b4cc0 100644 --- a/app/browser.py +++ b/app/browser.py @@ -1,13 +1,25 @@ +# browser module defines functions for creating selenium webdriver +# objects. The way this works is selenium is a third-party library +# that gives a common interface for interacting with web browsers, +# i.e. chrome, firefox, internet explorer, and even a pseudo-browser. 
+# +# This library (selenium) creates a web browser process +# (chrome will actually fire up for you to see) and gives you +# a webdriver object interface to programmatically control the browser. +# You can do things like click on links, extract information from the +# page, pass control to a user... the limit is your imagination. + import selenium import selenium.webdriver import logging -settings = {} - -def init(settings_obj): - global settings - settings = settings_obj +from . import config +# This function has a parameter (driver) that passes in a value. In this case, +# this driver variable defaults to the string 'chrome'. The code can call +# create_webdriver() which is the same as create_webdriver('chrome') but +# can alternatively call create_webdriver('firefox') and get different +# functionality. def create_webdriver(driver='chrome'): if driver == 'chrome': return create_webdriver_chrome() @@ -19,9 +31,7 @@ def create_webdriver_firefox(): def create_webdriver_chrome(): opt = selenium.webdriver.chrome.options.Options() - opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT) - opt.add_argument('--kiosk-printing') - opt.add_argument("--focus-existing-tab-on-open=false") + opt.add_argument('--user-agent=' + config.obj.WEBDRIVER_USER_AGENT) driver = selenium.webdriver.Chrome(chrome_options = opt) return driver diff --git a/app/cli.py b/app/cli.py index 67fdba9..67b8ff9 100644 --- a/app/cli.py +++ b/app/cli.py @@ -1,4 +1,8 @@ #!/usr/bin/env python +# The command-line interface module creates an interface for +# interacting with the python program (wikicrawl). This is an implementation +# of the baker demo shown previously. The user can type in commands to +# make the program do things. import baker import logging @@ -6,18 +10,13 @@ import readline # Needed for command history and arrows to work import sys from . import model +from . 
import config # Problem pages: # Decision (from politics) # Malaysia (goes inside parenthesis) commander = baker.Baker() -settings = {} - -def init(settings_obj): - global settings - settings = settings_obj - model.init(settings_obj) def main(): user_interface = InteractiveInterface() @@ -31,38 +30,129 @@ def main(): class InteractiveInterface: def __init__(self): self.model = model.Model() - x = self.model.webdriver # Request the browser open immediately def run(self, args, main=True): - try: - commander.run(argv=args, main=True, help_on_error=True, - instance=self) - except baker.CommandError as ex: - logging.warn('incorrect user input: %s' % ex) - commander.usage() - except baker.TopHelp as ex: - commander.usage() - except Exception as ex: - logging.error('caught general exception!!') - print(type(ex), ex) + """ + Runs the command-line interface for a single command. + + If called by InteractiveInterface.run(sys.argv), this method + will execute the commands and arguments specified on command + line when running this program. Alternatively, the code could + pass in a different set of arguments to specify what to do. + See start_command_loop() for more information. + """ + try: + commander.run(argv=args, main=True, help_on_error=True, + instance=self) + except baker.CommandError as ex: + logging.warn('incorrect user input: %s' % ex) + commander.usage() + except baker.TopHelp as ex: + commander.usage() + except Exception as ex: + logging.error('caught general exception!!') + print(type(ex), ex) def start_command_loop(self): """ Repeatedly asks the user what command to run until they exit. + + This method calls InteractiveInterface.run(args) a little bit + differently. Instead of passing the arguments from the command-line + that were passed in when invoking the python wikicrawl app, + this asks the user for a line of textual input and passes + those strings to run() as the arguments. 
This way, the user can + access an interactive shell and repeatedly issue different + commands while the application is running. """ commander.usage() + self.model.open_browser() while True: print('$ ', end = '') # Display to the user a command prompt + # The dollar-sign is a common indication + # of a shell that communicates to the user + # that we are waiting for their textual + # input. The end = '' indicates to python + # to NOT drop to a newline after printing + # in the terminal. Instead, let the user + # type their command on the same line as + # our printed '$ '. try: inp = input() except EOFError: # +D will send "End Line" and exit the command loop break - args = ['', ] + inp.split() + # Note in arguments (mg): + # Whenever a program is run in windows or *nix, the operating + # system passes in the command string that was used to invoke + # the program. You can append data in that command to configure + # switches or values going into the program on the fly. For + # example, you can invoke this wikicrawl app in more than one + # way. You can of course run "python launcher.py" to run the + # software but you can also pass in an argument. You can + # alternatively run "python launcher.py ..." + # and the operating system will provide the values into + # the process that is running. + # + # In a real world use case, many commands provide switches to + # adjust what the program does. For example, + # + # The command: + # find music -iname "*justin*bieber*" + # runs the "find" program and asks to find all the filenames that match the + # pattern *justin*bieber* in the "music" directory. + # (music, -iname, "*justin*biever*") are argument parameters + # that are passed into the program. The program is coded to + # parse and interpret these values and execute differently based + # on the values passed in. This is one way to pass in information + # into a running program. 
Some other ways are to read from a file + # (such as how we read from settings.py to load the runtime + # configuration), from something called environment variables + # (won't get into but another set of values provided to programs + # from the operating system), or they can be hard-coded into + # the application. + # + # Side note: arguments are not unique to python (almost all + # programming languages implement arguments), the functionality + # is defined by the application (some programs require arguments, + # some are optional, and the syntax for sending in argument + # parameters are different and defined by the individual programs, + # and lastly, the first argument sent in is the script name or + # filename of the script. In our case, the first argument is + # the string "launcher.py". If the user invoked the command + # as C:\Users\mguest\launcher.py then the first argument + # would be C:\Users\mguest\launcher.py. - if "--help" in args: - args.remove("--help") + # What this method (start_command_loop()) does is provide a + # REPL which is a + # read-eval-print-loop. It repeatedly asks the user for an + # input (read), evaluates that input into an action (evaluate), + # give the user some feedback (print), and start the process + # over again (loop). When you call "python", you are given a python + # process that gives you a REPL interactive shell. The way + # this wikicrawl app is implemented gives the user a REPL + # that has commands to interact with wikipedia pages. + args = [sys.argv[0], ] + inp.split() + + # The user can at any point in the command pass the argument + # switch "--help". If doing this, the command line interface + # will instead print out the inline documentation associated + # with this command and quit after doing so. For example, + # the user can type "python launcher.py do_random_page --help" + # and the program will spit out the generated documentation + # for the do_random_page command and run nothing. 
In our case, + # this documentation is created by the baker library and will + # print out the docstring associated with the method. Try it + # out in your shell (cmd.exe or powershell.exe) by invoking + # python launcher.py do_random_page --help + # You will see the program spit out the heredoc below the + # do_random_page method defined below. + + if '--help' in args: + args.remove('--help') try: - commander.usage(args[1]) + print('command usage:') + commander.usage(args[1]) + return except Exception as ex: print(type(ex), ex) continue @@ -71,13 +161,21 @@ class InteractiveInterface: @commander.command def do_random_page(self): + """ + Instructs the wikicrawl application to play the game on a random + article. + """ self.model.do_random_page() @commander.command def do_n_pages(self, n): + """ + Plays the wikicrawl game -times. + """ try: n = int(n) except ValueError as ex: + logging.warn('failed to process "%s" as a parameter' % n) return False for i in range(n): self.model.do_random_page() diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..290d66f --- /dev/null +++ b/app/config.py @@ -0,0 +1,13 @@ +# config module defines a place to store the external configuration/settings +# and is used to provide an interface to the runtime configuration for the +# program. + +from . import log + +obj = {} + +def init(settings_obj): + global obj + obj = settings_obj + log.init_logging() + diff --git a/app/dal.py b/app/dal.py index f4da48e..84d7cbe 100644 --- a/app/dal.py +++ b/app/dal.py @@ -2,11 +2,7 @@ import sqlite3 import pycurl import os -settings = {} - -def init(settings_obj): - global settings - settings = settings_obj +from . import config class DataLayer: def __init__(self): diff --git a/app/log.py b/app/log.py index 42c7787..a62fb3d 100644 --- a/app/log.py +++ b/app/log.py @@ -1,11 +1,7 @@ import logging -settings = {} - -def init(settings_obj): - global settings - settings = settings_obj - init_logging() +from . 
import config def init_logging(): - logging.basicConfig(level=settings.LOG_LEVEL) + logging.basicConfig(level=config.obj.LOG_LEVEL) + diff --git a/app/model.py b/app/model.py index 85cb1d7..89810d1 100644 --- a/app/model.py +++ b/app/model.py @@ -1,43 +1,96 @@ +# model module contains the business logic of the program. Notice +# the command-line interface contains no business logic and only +# has functionality to call on the model. Similarly, the page +# objects define no specific functionality for doing the wiki crawl +# but only provide general utility methods that are called upon +# to implement the wiki crawl. This is a separation of concerns +# and keeps the logic organized and separated. + import logging import os import time from . import browser -from . import log +from . import config from . import dal +from . import log from . import pages -settings = {} - -def init(settings_obj): - global settings - settings = settings_obj - - browser.init(settings_obj) - dal.init(settings_obj) - pages.init(settings_obj) - log.init(settings_obj) - class Model: def __init__(self): self._webdriver = None @property def webdriver(self): + # The way this works is when an object instance of class/type Model + # is called with x.webdriver, Model runs webdriver(). In our case, + # the webdriver() method checks if a private variable self._webdriver + # exists and if it doesn't, asks for a new selenium object. The result + # is that this will on-demand create a browser. If one exists, it will + # use the one that exists and if one doesn't exist, it will create + # one and use that. External code can rely on self.webdriver + # always existing with or without knowing if it exists because if it + # hasn't been created yet then it will be created on-the-fly. 
if not self._webdriver: - self._webdriver = browser.create_webdriver(settings.WEBDRIVER_BROWSER) + self._webdriver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER) page_api = pages.LandingPage(self.webdriver) page_api.goto_landing_page() return self._webdriver - def do_random_page(self): - # Landing page (select language) - page_api = pages.LandingPage(self.webdriver) - page_api.goto_landing_page() - page_api.select_language(settings.PAGE_LANGUAGE) + def open_browser(self): + x = self.webdriver # Request the browser open immediately. + # Without this, the Model object will + # be created on-demand (as defined in + # the Model class). This means that the + # web browser will not open until + # a command is typed in. But because we + # request the webdriver right here, Model + # creates it and then it is re-used later + # in the application. - # Main page + def do_random_page(self): + """ + Select a random page and repeatedly click the first link until + we reach the article on philosophy. Sometimes, the driver encounters + a loop and will never reach the page and sometimes the parser + fails and we fail to programmatically implement what we're trying to + do correctly. + """ + # The following 3 lines include the functionality + # for the Landing page (select language). + + # This line creates a new object (page_api) which is an instance + # of type class pages.LandingPage. LandingPage is a variable + # containing a class definition that is located in the pages module. + # We pass self.webdriver as arguments into the LandingPage.__init__ + # constructor. + page_api = pages.LandingPage(self.webdriver) + + # This line calls the page_api object's (an instance of + # pages.LandingPage type) method goto_landing_page. + page_api.goto_landing_page() + + # Similarly, this line calls the select_language method + # and passes in values from our runtime configuration. + # In this case, we have made the language a parameter + # that you can pass into the program, i.e. 
you can run it + # for English or Spanish or Russian or what have you. + page_api.select_language(config.obj.PAGE_LANGUAGE) + + # Main page: next 2 lines + + # At this point, we have clicked a link and changes the page. We + # re-create our page interface as a new object which is of + # a different class and includes distinct code for working with + # the page. In this case, we delete page_api and re-create it + # as an object of type pages.MainPage. Again, we pass in + # self.webdriver as an object of the selenium webdriver interface. + # The page_api calls methods on this webdriver to make the web + # browser do various things like click links or extract text. page_api = pages.MainPage(self.webdriver) + + # We call pages.MainPage.goto_random_article() to perform + # the action we're trying to invoke. page_api.goto_random_article() # Article page @@ -62,4 +115,3 @@ class Model: break print() - diff --git a/app/pages.py b/app/pages.py index bcf24a1..658d352 100644 --- a/app/pages.py +++ b/app/pages.py @@ -1,50 +1,109 @@ +# Pages module defines classes for interacting with wikipedia pages. +# There are separate classes defined for each page with their own +# defined methods for performing certain actions. + import logging import re import selenium -import selenium.webdriver import time -settings = {} - -def init(settings_obj): - global settings - settings = settings_obj +from . import browser +from . import config def breakpoint(): - if settings.DO_BREAKPOINTS: + """ + If DO_BREAKPOINTS is switched on, this will pause program + execution and wait for the user to press enter to continue. + """ + if config.obj.DO_BREAKPOINTS: input('Breakpoint here. to continue...') class PageRootObject: + """ + Common interface methods for working with pages. The specific + page classes below inherit these methods and define additional methods + so every page has available these methods and any additional + methods they define. 
+ + In here are some re-used methods to click links and highlight + elements in the browser. + """ def __init__(self, driver=None): + """ + Object constructor for initializing the instance of this + class with internal variables needed. + + Args: + driver: Reference to the selenium webdriver object + that is used to interface with the web browser. + """ if not driver: - self.driver = create_webdriver() + self.driver = browser.create_webdriver() else: self.driver = driver - def click(self, el): + def click(self, el): + """ + Clicks a link in the browser and also highlights it to the + end user. + + Args: + el: selenium element to be clicked. Typically an anchor + html link in the page. + """ self.highlight(el, 'red') - time.sleep(settings.PAGE_DELAY) + time.sleep(config.obj.PAGE_DELAY) + breakpoint() el.click() def highlight(self, el, color): + """ + Highlights an html element in the web browser by changing the + background color as well as making the text bold. + + The implementation uses javascript to alter the css of the element. + + Args: + el: selenium element to be highlighted. + color: background color to highlight. Input can be one of + 'red', 'blue', or hex code such as '#ffffff'. + """ + # Note: The way hex codes work is there are 1 byte (2 hex characters) + # for every color. #RRGGBB for (red, green, blue). This can be thought + # of as an integer 0-255 for red, green, and blue in base-16 hexadecimal. if color == 'red': js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292' elif color == 'blue': js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff' + else: + js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color self.driver.execute_script(js, el) class LandingPage(PageRootObject): + """ + Interface for working with the wikipedia.org landing page. 
This page has links to + select a language and go to the respective wikipedia root page. + """ + + # Note: This is the LandingPage() object constructor. All it does right now is + # reference the parent (PageRootObject) constructor method and call it. This + # calls PageRootObject.__init__(driver) which makes the web driver available + # in the object instance. def __init__(self, driver=None): super().__init__(driver) def goto_landing_page(self): - self.driver.get(settings.PAGE_BASE_URL) + self.driver.get(config.obj.PAGE_BASE_URL) def select_language(self, language): link = self.driver.find_element_by_partial_link_text(language) self.click(link) class MainPage(PageRootObject): + """ + Interface for a selected language root page. This has the link to go to a random article + and has a featured article. An example url for this is https://en.wikipedia.org. + """ def __init__(self, driver=None): super().__init__(driver) @@ -53,7 +112,17 @@ class MainPage(PageRootObject): self.click(link) class ArticlePage(PageRootObject): + """ + Interface for a wikipedia article page. Here are defined some utility methods to + try and click the first valid link and extract some information from the page. + """ + # Here are static class-scoped variables that are needed to work with the page. + # These are used to locate html elements in the web browser. There are many + # ways to locate elements but one of the best if available is locating by id. It's + # not enforced but the html specification mandates that element ids are unique + # so if you can select by id in a semantically correct web page, you can correctly + # select unique elements with high confidence. elements = { 'main-window-content-text-id': 'mw-content-text', 'article-title': 'firstHeading', @@ -69,7 +138,24 @@ class ArticlePage(PageRootObject): def click_first_link(self): return self._iterate_paragraphs() + # Note: Here this method has its name prepended with a single underscore. 
+ # This is a convention that communicates to the developer that these methods + # are internal private methods. That means they are not meant to be exposed + # to the external interface. Python does not restrict calling these methods. + # You can still call ArticlePage._iterate_paragraphs() but the prefix + # underscore tells you that it is not intended to be exposed and may be + # unsafe to call. Depending on the implementation, it may not make sense + # to directly call this method and may result in undefined and unexpected + # behavior. _iterate_paragraphs is called internally from the exposed + # click_first_link() but is never invoked externally. def _iterate_paragraphs(self): + """ + Iterates through paragraphs in the page and attempts to find the first + valid link. Sometimes the first paragraph does not have a link so this + needs to go through a few paragraphs and it does not make sense to + operate on the entire article every time when we're just looking for + the first link, for performance optimization. + """ main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id']) paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]') for p in paragraphs: @@ -89,8 +175,7 @@ class ArticlePage(PageRootObject): continue self.highlight(link, 'red') logging.info('selected link: %s' % link.text) - breakpoint() - link.click() + self.click(link) return True def _is_valid_link(self, p, el): @@ -98,45 +183,61 @@ class ArticlePage(PageRootObject): b = self._is_link_a_footnote(el) c = self._is_link_pronounciation(el) d = self._is_link_audio(el) - print(a, b, c, d) if not a and not b and not c and not d: return True return False def _is_link_in_parenthesis(self, p, el): - # link_text = el.text + """ + Determine if a given link element is inside a set + of textual parenthesis. 
+ """ + # Implementation notes (mg): + # I've tried a few different ways to do this and it's + # hard to get it to work in every case. I want to avoid + # certain links and usually avoid links inside parenthetical + # notes. Some edge cases are nested parentheses, links with + # non-english characters (which are displayed with a tree + # of elements in the html rather than a simple link). And + # sometimes, the link inside the parentheses may be a valid + # target. I've made it so that skipped links show up as blue + # and determined-valid links highlight as red. link_text = el.get_attribute('outerHTML') p_text = p.get_attribute('innerHTML') - regex_str = '\(.*?\)' + regex_str = '\(.*?\)' # Regular expression to extract the + # text inside (not nested) parentheses regex = re.compile(regex_str, flags=re.UNICODE) match = regex.search(p_text) if not match: + # There are no parentheses at all in this paragraph. return False while match is not None: + # There may be multiple parentheses (or nested). This + # iterates through them and checks if the link's html + # is present inside these parentheses. + # + # Care must be taken with regular expressions as they are + # user/developer unfriendly, hard-to-read, and unforgiving. + # For example, what happens when you try to match () + # inside of (some words) some more words (even more words), you + # can match unpaired parentheses and the computer will return + # unexpected results. The code is quite dumb and does exactly + # what you tell it to. match_text = match.group(0) match_idx = match.end(0) - print(link_text) - print(match_text) if link_text in match_text: return True - match = regex.search(p_text, match_idx+1) - # Is the link inside parenthesis? 
- # regex_str = '\([^()]*%s[^())]*\)' % re.escape(link_text) - # regex_str = '\(.*%s.*\)' % link_text - # print(regex_str) - # regex = re.compile(regex_str, flags=re.UNICODE) - # match = re.search(regex_str, p_text) - # if match: # Pattern is found in the text - # print(match.group(0)) - # return True - # else: - # return False + return False def _is_link_a_footnote(self, el): + # Some links are anchors to footnotes, e.g. [1] that points to a source + # at the bottom of the page. These aren't valid links for our purpose + # so this method looks for that and determines if the reference element + # appears to be a link to a footnote. href = el.get_attribute('href') if '#cite_note' in href: return True @@ -145,12 +246,18 @@ class ArticlePage(PageRootObject): return False def _is_link_pronounciation(self, el): + # Some links point to the wikipedia IPA (international phonetic + # alphabet) pronunciation help page. We don't want to click these + # links so we scan for and ignore them. href = el.get_attribute('href') if '/wiki/Help:IPA' in href: return True return False def _is_link_audio(self, el): + # Some links are audio playback pronunciations. We look for these + # by checking for the file extension .ogg (an audio file format, + # ogg-vorbis) and ignoring links if they are of that type. href = el.get_attribute('href') if '.ogg' in href: return True diff --git a/launcher.py b/launcher.py index 9f7049e..a571da4 100644 --- a/launcher.py +++ b/launcher.py @@ -1,6 +1,9 @@ -import app.cli +import app import settings -app.cli.init(settings.Settings) -app.cli.main() +# Inject the settings.DefaultSettings object into the +# app and start running the program. +app.init(settings.DefaultSettings) +app.main() input(' to exit') + diff --git a/settings.py b/settings.py index b39979e..c2e9608 100644 --- a/settings.py +++ b/settings.py @@ -1,6 +1,16 @@ +# Application run-time configuration/settings. 
This contains variables +# that control how the program works but are kept separate from the +# program. It makes sense for certain parameters to be adjustable but +# not hard-coded into the application. For example, some users may want +# to run this program in English while others may want to run in Spanish. +# The way this works is we specify those variables external from the +# application (here) and pass them into the application (app.config module). +# The application then references app.config.obj to access the variables +# passed in from here. + import logging -class Settings: +class DefaultSettings: # Application Parameters LOG_LEVEL = logging.INFO DO_BREAKPOINTS = True