lots of documentation and safer functionality

This commit is contained in:
Mathew Guest 2017-08-22 17:50:33 -06:00
parent 64093c58a2
commit 5d88690ded
11 changed files with 392 additions and 99 deletions

@ -1,4 +1 @@
You need selenium-server installed and running:
java -jar /usr/share/selenium-server/selenium-server-standalone.jar -timeout 0

@ -1 +1,12 @@
# The __init__.py file signals to the python interpreter that the
# app directory is a package. A package is a special module that
# contains other modules. Each file is a module (browser, cli, etc.)
# and the "app" package is a module that contains other modules.
# The "app" module exports the stuff exposed here. We export
# app.init() as a reference to app.config.init() and app.main
# as a reference to app.cli.main
from .config import init
from .cli import main

@ -1,13 +1,25 @@
# browser module defines functions for creating selenium webdriver
# objects. The way this works is selenium is a third-party library
# that gives a common interface for interacting with web browsers,
# i.e. chrome, firefox, internet explorer, and even a pseudo-browser.
#
# This library (selenium) creates a web browser process
# (chrome will actually fire up for you to see) and gives you
# a webdriver object interface to programmatically control the browser.
# You can do things like click on links, extract information from the
# page, pass control to a user... the limit is your imagination.
import selenium
import selenium.webdriver
import logging
settings = {}
def init(settings_obj):
global settings
settings = settings_obj
from . import config
# This function has a parameter (driver) that passes in a value. In this case,
# this driver variable defaults to the string 'chrome'. The code can call
# create_webdriver() which is the same as create_webdriver('chrome') but
# can alternatively call create_webdriver('firefox') and get different
# functionality.
def create_webdriver(driver='chrome'):
if driver == 'chrome':
return create_webdriver_chrome()
@ -19,9 +31,7 @@ def create_webdriver_firefox():
def create_webdriver_chrome():
opt = selenium.webdriver.chrome.options.Options()
opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT)
opt.add_argument('--kiosk-printing')
opt.add_argument("--focus-existing-tab-on-open=false")
opt.add_argument('--user-agent=' + config.obj.WEBDRIVER_USER_AGENT)
driver = selenium.webdriver.Chrome(chrome_options = opt)
return driver

@ -1,4 +1,8 @@
#!/usr/bin/env python
# The command-line interface module creates an interface for
# interacting with the python program (wikicrawl). This is an implementation
# of the baker demo shown previously. The user can type in commands to
# make the program do things.
import baker
import logging
@ -6,18 +10,13 @@ import readline # Needed for command history <up> and <down> arrows to work
import sys
from . import model
from . import config
# Problem pages:
# Decision (from politics)
# Malaysia (goes inside parenthesis)
commander = baker.Baker()
settings = {}
def init(settings_obj):
global settings
settings = settings_obj
model.init(settings_obj)
def main():
user_interface = InteractiveInterface()
@ -31,9 +30,17 @@ def main():
class InteractiveInterface:
def __init__(self):
self.model = model.Model()
x = self.model.webdriver # Request the browser open immediately
def run(self, args, main=True):
"""
Runs the command-line interface for a single command.
If called by InteractiveInterface.run(sys.argv), this method
will execute the commands and arguments specified on command
line when running this program. Alternatively, the code could
pass in a different set of arguments to specify what to do.
See start_command_loop() for more information.
"""
try:
commander.run(argv=args, main=True, help_on_error=True,
instance=self)
@ -49,20 +56,103 @@ class InteractiveInterface:
def start_command_loop(self):
"""
Repeatedly asks the user what command to run until they exit.
This method calls InteractiveInterface.run(args) a little bit
differently. Instead of passing the arguments from the command-line
that were passed in when invoking the python wikicrawl app,
this asks the user for a line of textual input and passes
those strings to run() as the arguments. This way, the user can
access an interactive shell and repeatedly issue different
commands while the application is running.
"""
commander.usage()
self.model.open_browser()
while True:
print('$ ', end = '') # Display to the user a command prompt
# The dollar-sign is a common indication
# of a shell that communicates to the user
# that we are waiting for their textual
# input. The end = '' indicates to python
# to NOT drop to a newline after printing
# in the terminal. Instead, let the user
# type their command on the same line as
# our printed '$ '.
try:
inp = input()
except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
break
args = ['', ] + inp.split()
# Note in arguments (mg):
# Whenever a program is run in windows or *nix, the operating
# system passes in the command string that was used to invoke
# the program. You can append data in that command to configure
# switches or values going into the program on the fly. For
# example, you can invoke this wikicrawl app in more than one
# way. You can of course run "python launcher.py" to run the
# software but you can also pass in an argument. You can
# alternatively run "python launcher.py <argument> <argument>..."
# and the operating system will provide the <argument> values into
# the process that is running.
#
# In a real world use case, many commands provide switches to
# adjust what the program does. For example,
#
# The command:
# find music -iname "*justin*bieber*"
# runs the "find" program and asks to find all the filenames that match the
# pattern *justin*bieber* in the "music" directory.
# (music, -iname, "*justin*bieber*") are argument parameters
# that are passed into the program. The program is coded to
# parse and interpret these values and execute differently based
# on the values passed in. This is one way to pass in information
# into a running program. Some other ways are to read from a file
# (such as how we read from settings.py to load the runtime
# configuration), from something called environment variables
# (won't get into but another set of values provided to programs
# from the operating system), or they can be hard-coded into
# the application.
#
# Side notes: arguments are not unique to python (almost all
# programming languages implement arguments). The functionality
# is defined by the application: some programs require arguments,
# some make them optional, and the syntax for sending in argument
# parameters differs and is defined by the individual programs.
# Lastly, the first argument sent in is the script name or
# filename of the script. In our case, the first argument is
# the string "launcher.py". If the user invoked the command
# as C:\Users\mguest\launcher.py then the first argument
# would be C:\Users\mguest\launcher.py.
if "--help" in args:
args.remove("--help")
# What this method (start_command_loop()) does is provide a
# REPL, which is a
# read-eval-print-loop. It repeatedly asks the user for an
# input (read), evaluates that input into an action (eval),
# gives the user some feedback (print), and starts the process
# over again (loop). When you call "python", you are given a python
# process that gives you a REPL interactive shell. The way
# this wikicrawl app is implemented gives the user a REPL
# that has commands to interact with wikipedia pages.
args = [sys.argv[0], ] + inp.split()
# The user can at any point in the command pass the argument
# switch "--help". If doing this, the command line interface
# will instead print out the inline documentation associated
# with this command and quit after doing so. For example,
# the user can type "python launcher.py do_random_page --help"
# and the program will spit out the generated documentation
# for the do_random_page command and run nothing. In our case,
# this documentation is created by the baker library and will
# print out the docstring associated with the method. Try it
# out in your shell (cmd.exe or powershell.exe) by invoking
# python launcher.py do_random_page --help
# You will see the program spit out the heredoc below the
# do_random_page method defined below.
if '--help' in args:
args.remove('--help')
try:
print('command usage:')
commander.usage(args[1])
return
except Exception as ex:
print(type(ex), ex)
continue
@ -71,13 +161,21 @@ class InteractiveInterface:
@commander.command
def do_random_page(self):
    """
    Instructs the wikicrawl application to play the game on a random
    article.
    """
    # The command-line layer holds no business logic of its own: it
    # simply delegates to the model, which implements the crawl.
    self.model.do_random_page()
@commander.command
def do_n_pages(self, n):
    """
    Plays the wikicrawl game <n>-times.
    """
    # n is whatever the caller passed in (text when typed at the
    # interactive prompt), so it is converted to an integer first.
    # Returns False when the conversion fails; otherwise falls through
    # and returns None after playing the requested number of games.
    try:
        n = int(n)
    except ValueError:
        # logging.warn is a deprecated alias that newer Python versions
        # no longer provide; logging.warning is the supported spelling.
        # The emitted message text is unchanged.
        logging.warning('failed to process "%s" as a parameter', n)
        return False
    for i in range(n):
        self.model.do_random_page()

13
app/config.py Normal file

@ -0,0 +1,13 @@
# config module defines a place to store the external configuration/settings
# and is used to provide an interface to the runtime configuration for the
# program.
from . import log
obj = {}
def init(settings_obj):
    """
    Store the externally supplied settings object and set up logging.

    Args:
        settings_obj: object holding the runtime configuration. It is
            bound to the module-level name `obj` so that other modules
            can read the values through config.obj.
    """
    global obj
    obj = settings_obj
    # Logging is configured only after obj has been bound because
    # log.init_logging() reads its level from config.obj.LOG_LEVEL.
    log.init_logging()

@ -2,11 +2,7 @@ import sqlite3
import pycurl
import os
settings = {}
def init(settings_obj):
global settings
settings = settings_obj
from . import config
class DataLayer:
def __init__(self):

@ -1,11 +1,7 @@
import logging
settings = {}
def init(settings_obj):
global settings
settings = settings_obj
init_logging()
from . import config
def init_logging():
logging.basicConfig(level=settings.LOG_LEVEL)
logging.basicConfig(level=config.obj.LOG_LEVEL)

@ -1,43 +1,96 @@
# model module contains the business logic of the program. Notice
# the command-line interface contains no business logic and only
# has functionality to call on the model. Similarly, the page
# objects define no specific functionality for doing the wiki crawl
# but only provide general utility methods that are called upon
# to implement the wiki crawl. This is a separation of concerns
# and keeps the logic organized and separated.
import logging
import os
import time
from . import browser
from . import log
from . import config
from . import dal
from . import log
from . import pages
settings = {}
def init(settings_obj):
global settings
settings = settings_obj
browser.init(settings_obj)
dal.init(settings_obj)
pages.init(settings_obj)
log.init(settings_obj)
class Model:
def __init__(self):
self._webdriver = None
@property
def webdriver(self):
# The way this works is when an object instance of class/type Model
# is called with x.webdriver, Model runs webdriver(). In our case,
# the webdriver() method checks if a private variable self._webdriver
# is set and, if it isn't, asks for a new selenium object. The result
# is that this will on-demand create a browser. If one exists, it will
# use the one that exists and if one doesn't exists, it will create
# one and use that. External code can rely on self.webdriver
# always existing with or without knowing if it exists because if it
# hasn't been created yet then it will be created on-the-fly.
if not self._webdriver:
self._webdriver = browser.create_webdriver(settings.WEBDRIVER_BROWSER)
self._webdriver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER)
page_api = pages.LandingPage(self.webdriver)
page_api.goto_landing_page()
return self._webdriver
def do_random_page(self):
# Landing page (select language)
page_api = pages.LandingPage(self.webdriver)
page_api.goto_landing_page()
page_api.select_language(settings.PAGE_LANGUAGE)
def open_browser(self):
    # Request the browser open immediately. Without this, the
    # webdriver would only be created on-demand (as defined by the
    # Model.webdriver property), which means that the web browser
    # would not open until a command is typed in. Because we request
    # the webdriver right here, Model creates it now and it is
    # re-used later in the application. The value itself is not
    # needed; reading the property is what triggers the creation.
    x = self.webdriver
# Main page
def do_random_page(self):
"""
Select a random page and repeatedly click the first link until
we reach the article on philosophy. Sometimes, the driver encounters
a loop and will never reach the page and sometimes the parser
fails and we fail to programmatically implement what we're trying to
do correctly.
"""
# The following 3 lines include the functionality
# for the Landing page (select language).
# This line creates a new object (page_api) which is an instance
# of type class pages.LandingPage. LandingPage is a variable
# containing a class definition that is located in the pages module.
# We pass self.webdriver as arguments into the LandingPage.__init__
# constructor.
page_api = pages.LandingPage(self.webdriver)
# This line calls the page_api object's (an instance of
# pages.LandingPage type) method goto_landing_page.
page_api.goto_landing_page()
# Similarly, this line calls the select_language method
# and passes in values from our runtime configuration.
# In this case, we have made the language a parameter
# that you can pass into the program, i.e. you can run it
# for English or Spanish or Russian or what have you.
page_api.select_language(config.obj.PAGE_LANGUAGE)
# Main page: next 2 lines
# At this point, we have clicked a link and changed the page. We
# re-create our page interface as a new object which is of
# a different class and includes distinct code for working with
# the page. In this case, we delete page_api and re-create it
# as an object of type pages.MainPage. Again, we pass in
# self.webdriver as an object of the selenium webdriver interface.
# The page_api calls methods on this webdriver to make the web
# browser do various things like click links or extract text.
page_api = pages.MainPage(self.webdriver)
# We call pages.MainPage.goto_random_article() to perform
# the action we're trying to invoke.
page_api.goto_random_article()
# Article page
@ -62,4 +115,3 @@ class Model:
break
print()

@ -1,50 +1,109 @@
# Pages module defines classes for interacting with wikipedia pages.
# There are separate classes defined for each page with their own
# defined methods for performing certain actions.
import logging
import re
import selenium
import selenium.webdriver
import time
settings = {}
def init(settings_obj):
global settings
settings = settings_obj
from . import browser
from . import config
def breakpoint():
if settings.DO_BREAKPOINTS:
"""
If DO_BREAKPOINTS is switched on, this will pause program
execution and wait for the user to press enter to continue.
"""
if config.obj.DO_BREAKPOINTS:
input('Breakpoint here. <Enter> to continue...')
class PageRootObject:
"""
Common interface methods for working with pages. The specific
page classes below inherit these methods and define additional methods
so every page has available these methods and any additional
methods they define.
In here are some re-used methods to click links and highlight
elements in the browser.
"""
def __init__(self, driver=None):
"""
Object constructor for initializing the instance of this
class with internal variables needed.
Args:
driver: Reference to the selenium webdriver object
that is used to interface with the web browser.
"""
if not driver:
self.driver = create_webdriver()
self.driver = browser.create_webdriver()
else:
self.driver = driver
def click(self, el):
"""
Clicks a link in the browser and also highlights it to the
end user.
Args:
el: selenium element to be clicked. Typically an anchor
html link in the page.
"""
self.highlight(el, 'red')
time.sleep(settings.PAGE_DELAY)
time.sleep(config.obj.PAGE_DELAY)
breakpoint()
el.click()
def highlight(self, el, color):
    """
    Highlights an html element in the web browser by changing the
    background color as well as making the text bold.

    The implementation runs a small piece of javascript through the
    webdriver that overwrites the element's "style" attribute.

    Args:
        el: selenium element to be highlighted.
        color: background color to highlight. Input can be one of
               'red', 'blue', or hex code such as '#ffffff'.
    """
    # Note: A hex code uses 1 byte (2 hex characters) per color
    # channel, laid out as #RRGGBB (red, green, blue). Each channel
    # can be thought of as an integer 0-255 written in base-16.
    style_script = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")'

    # The two named colors map to softer pastel shades; anything else
    # is passed through to the css untouched.
    if color == 'red':
        shade = '#ff9292'
    elif color == 'blue':
        shade = '#9292ff'
    else:
        shade = color

    self.driver.execute_script(style_script % shade, el)
class LandingPage(PageRootObject):
"""
Interface for working with the wikipedia.org landing page. This page has links to
select a language and go to the respective wikipedia root page.
"""
# Note: This is the LandingPage() object constructor. All it does right now is
# reference the parent (PageRootObject) constructor method and call it. This
# calls PageRootObject.__init__(driver) which makes the web driver available
# in the object instance.
def __init__(self, driver=None):
super().__init__(driver)
def goto_landing_page(self):
self.driver.get(settings.PAGE_BASE_URL)
self.driver.get(config.obj.PAGE_BASE_URL)
def select_language(self, language):
link = self.driver.find_element_by_partial_link_text(language)
self.click(link)
class MainPage(PageRootObject):
"""
Interface for a selected language root page. This has the link to go to a random article
and has a featured article. An example url for this is https://en.wikipedia.org.
"""
def __init__(self, driver=None):
super().__init__(driver)
@ -53,7 +112,17 @@ class MainPage(PageRootObject):
self.click(link)
class ArticlePage(PageRootObject):
"""
Interface for a wikipedia article page. Here are defined some utility methods to
try and click the first valid link and extract some information from the page.
"""
# Here are static class-scoped variables that are needed to work with the page.
# These are used to locate html elements in the web browser. There are many
# ways to locate elements but one of the best, if available, is locating by id.
# Browsers do not enforce it, but the html specification mandates that element
# ids are unique, so if you can select by id in a semantically correct web page,
# you can select unique elements with high confidence.
elements = {
'main-window-content-text-id': 'mw-content-text',
'article-title': 'firstHeading',
@ -69,7 +138,24 @@ class ArticlePage(PageRootObject):
def click_first_link(self):
return self._iterate_paragraphs()
# Note: Here this method has its name prefixed with a single underscore.
# This is a convention that communicates to the developer that such methods
# are internal private methods. That means they are not meant to be exposed
# to the external interface. Python does not restrict calling these methods.
# You can still call ArticlePage._iterate_paragraphs() but the prefix
# underscore tells you that it is not intended to be exposed and may be
# unsafe to call. Depending on the implementation, it may not make sense
# to directly call this method and may result in undefined and unexpected
# behavior. _iterate_paragraphs is called internally from the exposed
# click_first_link() but is never invoked externally.
def _iterate_paragraphs(self):
"""
Iterates through paragraphs in the page and attempts to find the first
valid link. Sometimes the first paragraph does not have a link so this
needs to go through a few paragraphs and it does not make sense to
operate on the entire article every time when we're just looking for
the first link, for performance optimization.
"""
main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
for p in paragraphs:
@ -89,8 +175,7 @@ class ArticlePage(PageRootObject):
continue
self.highlight(link, 'red')
logging.info('selected link: %s' % link.text)
breakpoint()
link.click()
self.click(link)
return True
def _is_valid_link(self, p, el):
@ -98,45 +183,61 @@ class ArticlePage(PageRootObject):
b = self._is_link_a_footnote(el)
c = self._is_link_pronounciation(el)
d = self._is_link_audio(el)
print(a, b, c, d)
if not a and not b and not c and not d:
return True
return False
def _is_link_in_parenthesis(self, p, el):
# link_text = el.text
"""
Determine if a given link element is inside a set
of textual parenthesis.
"""
# Implementation notes (mg):
# I've tried a few different ways to do this and it's
# hard to get it to work in every case. I want to avoid
# certain links and usually avoid links inside parenthetical
# notes. Some edge cases are nested parentheses and links with
# non-english characters (which are displayed with a tree
# of elements in the html rather than a simple link). And
# sometimes, the link inside the parentheses may be a valid
# target. I've made it so that skipped links show up as blue
# and determined-valid links highlight as red.
link_text = el.get_attribute('outerHTML')
p_text = p.get_attribute('innerHTML')
regex_str = '\(.*?\)'
regex_str = '\(.*?\)' # Regular expression to extract the
# text inside (not nested) parenthesis
regex = re.compile(regex_str, flags=re.UNICODE)
match = regex.search(p_text)
if not match:
# There are no parenthesis at all in this paragraph.
return False
while match is not None:
# There may be multiple parentheses (or nested ones). This
# iterates through them and checks if the link's html
# is present inside these parentheses.
#
# Care must be taken with regular expressions as they are
# user/developer unfriendly, hard-to-read, and unforgiving.
# For example, when you try to match (<anything>) inside of
# "(some words) some more words (even more words)", you
# can match unpaired parentheses and the computer will return
# unexpected results. The code is quite dumb and does exactly
# what you tell it to.
match_text = match.group(0)
match_idx = match.end(0)
print(link_text)
print(match_text)
if link_text in match_text:
return True
match = regex.search(p_text, match_idx+1)
# Is the link inside parenthesis?
# regex_str = '\([^()]*<a.*?>%s</a>[^())]*\)' % re.escape(link_text)
# regex_str = '\(.*<a.*?>%s.*\)' % link_text
# print(regex_str)
# regex = re.compile(regex_str, flags=re.UNICODE)
# match = re.search(regex_str, p_text)
# if match: # Pattern is found in the text
# print(match.group(0))
# return True
# else:
# return False
return False
def _is_link_a_footnote(self, el):
# Some links are anchors to footnotes, e.g. [1] that points to a source
# at the bottom of the page. These aren't valid links for our purpose
# so this method looks for that and determines if the reference element
# appears to be a link to a footnote.
href = el.get_attribute('href')
if '#cite_note' in href:
return True
@ -145,12 +246,18 @@ class ArticlePage(PageRootObject):
return False
def _is_link_pronounciation(self, el):
# Some links point to the wikipedia IPA (international phonetic
# alphabet) pronounciation help page. We don't want to click these
# links so we scan for and ignore them.
href = el.get_attribute('href')
if '/wiki/Help:IPA' in href:
return True
return False
def _is_link_audio(self, el):
# Some links are audio playback pronunciations. We look for these
# by checking for the file extension .ogg (an audio file format,
# ogg-vorbis) and ignoring links if they are of that type.
href = el.get_attribute('href')
if '.ogg' in href:
return True

@ -1,6 +1,9 @@
import app.cli
import app
import settings
app.cli.init(settings.Settings)
app.cli.main()
# Inject the settings.DefaultSettings object into the
# app and start running the program.
app.init(settings.DefaultSettings)
app.main()
input('<enter> to exit')

@ -1,6 +1,16 @@
# Application run-time configuration/settings. This contains variables
# that control how the program works but are kept separate from the
# program. It makes sense for certain parameters to be adjustable but
# not hard-coded into the application. For example, some users may want
# to run this program in English while others may want to run in Spanish.
# The way this works is we specify those variables external from the
# application (here) and pass them into the application (app.config module).
# The application then references app.config.obj to access the variables
# passed in from here.
import logging
class Settings:
class DefaultSettings:
# Application Parameters
LOG_LEVEL = logging.INFO
DO_BREAKPOINTS = True