colored logging, multiple selenium drivers, multi language support

2026-01-11 10:16:29 -07:00 · 2017-08-25 18:09:46 -06:00 · 2017-08-25 18:09:46 -06:00 · 4aa965cfc8
commit 4aa965cfc8
parent 5d88690ded
17 changed files with 421 additions and 113 deletions
--- a/app/init.py
+++ b/app/init.py
@ -1,12 +0,0 @@
-# The __init__.py file signals to the python interpreter that the
-# app directory is a package. A package is a special module that
-# contains other modules. Each file is a module (browser, cli, etc.)
-# and the "app" package is a module that contains other modules.
-
-# The "app" module exports the stuff exposed here. We export
-# app.init() as a reference to app.config.init() and app.main
-# as a reference to app.cli.main
-
-from .config import init
-from .cli import main
-
--- a/app/log.py
+++ b/app/log.py
@ -1,7 +0,0 @@
-import logging
-
-from . import config
-
-def init_logging():
-    logging.basicConfig(level=config.obj.LOG_LEVEL)
-
--- a/launcher.py
+++ b/launcher.py
@ -1,9 +1,9 @@
-import app
+import wikicrawl
 import settings

 # Inject the settings.DefaultSettings object into the
 # app and start running the program.
-app.init(settings.DefaultSettings)
-app.main()
+wikicrawl.init(settings.DefaultSettings)
+wikicrawl.main()
 input('<enter> to exit')

--- a/settings.py
+++ b/settings.py
@ -4,28 +4,107 @@
 # not hard-coded into the application. For example, some users may want
 # to run this program in English while others may want to run in Spanish.
 # The way this works is we specify those variables external from the
-# application (here) and pass them into the application (app.config module).
-# The application then references app.config.obj to access the variables
+# application (here) and pass them into the application (wikicrawl.config module).
+# The application then references wikicrawl.config.obj to access the variables
 # passed in from here.

+import colorlog
 import logging
+import logging.config

 class DefaultSettings:
+    # Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!!
+    LOG_FILENAME = '/tmp/wikicrawl.log'
+    SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
+
    # Application Parameters
-    LOG_LEVEL = logging.INFO
-    DO_BREAKPOINTS = True
+    DO_BREAKPOINTS = False
    PAGE_DELAY = 0

    # Web Driver Parameters
    WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'
-    WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox'
+
+    # Requested browser and webdriver dependencies are required for this to work.
+    # This means you need to have installed on your system:
+    # Chrome + WebDriver for Chrome
+    # Firefox + geckodriver for Firefox
+    # phantomjs for phantom
+    WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox', 'phantom'

    # Wikipedia Parameters
    PAGE_BASE_URL = 'https://www.wikipedia.org/'
-    PAGE_LANGUAGE = 'English'
-    # PAGE_LANGUAGE = 'Español'
-    # PAGE_LANGUAGE = 'Русский'

-    # Data Layer Parameters
-    SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
+    # Supported Languages so far:
+    # German, English, Spanish, French, Italian, Polish, Portuguese, Russian
+    # 'de', 'en', 'es', 'fr', 'it', 'pl', 'pt', 'ru'
+    PAGE_LANGUAGE = 'en'
+
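+    # API Keys (NOTE: real keys should not be committed to version control; prefer loading them from the environment)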
+    YANDEX_API_KEY = 'trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3'
+
+    # Logging Parameters
+    LOG_SETTINGS = {
+        'version': 1, # version is always 1
+        'formatters': {
+            'colored': {
+                '()': 'colorlog.ColoredFormatter',
+                'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(blue)s%(message)s'
+            },
+            'basic': {
+                '()': 'logging.Formatter',
+                'format': '%(levelname)s:%(name)s:%(asctime)s:%(message)s'
+            }
+        },
+        'handlers': {
+            'stderr': {
+                'class': 'logging.StreamHandler',
+                # The handler level will override the logger level if higher.
+                # That is, if the logger level is set to pass through DEBUG
+                # and higher and the handler is set to only pass through WARNING
+                # and higher, DEBUG messages will not pass through to this logger's
+                # handler. You can configure multiple handlers for any logger so
+                # for example you could log WARNINGS and ERRORS to a file but
+                # not save all the DEBUG messages.
+                'level': logging.DEBUG,
+                'formatter': 'colored'
+            },
+            'file': {
+                'class': 'logging.handlers.RotatingFileHandler',
+                'level': logging.INFO,
+                'formatter': 'basic',
+                'filename': LOG_FILENAME,
+                'maxBytes': 32768,
+                'backupCount': 3
+            }
+        },
+        'loggers': {
+            # Root Logger
+            '': {
+                'level': logging.DEBUG,
+                'handlers': ['file'],
+            },
+            'main': {
+                'level': logging.DEBUG,
+                'handlers': ['stderr'],
+                'propagate': False
+            },
+            'model': {
+                'level': logging.DEBUG,
+                'handlers': ['stderr'],
+                'propagate': True
+            },
+            'cli': {
+                'level': logging.DEBUG,
+                'handlers': ['stderr'],
+                'propagate': False
+            },
+            'pages': {
+                'level': logging.INFO,
+                'handlers': ['stderr'],
+                'propagate': False
+            }
+        }
+    }
+
+

--- a/setup.py
+++ b/setup.py
@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# setup.py is the install script for this application. This will download
+# required third-party dependencies and package the app. You can also
+# install the application system-wide.
+
+from setuptools import setup
+
+__project__ = 'wikicrawl'
+# If you're looking for a versioning scheme, one revered pattern
+# can be read about at http://semver.org
+__version__ = '0.9.0'
+
+setup(name = __project__,
+      version = __version__,
+      description = '',
+      author = '',
+      author_email = '',
+      url = '',
+      # Every third-party package imported by wikicrawl or settings.py
+      # must be listed here so that installing the app pulls it in.
+      install_requires = ('baker',
+                          'colorlog',
+                          'selenium',
+                          'yandex.translate',
+                          ),
+      packages = ('wikicrawl',))
+
--- a/wikicrawl/.dal.py.swp
+++ b/wikicrawl/.dal.py.swp
--- a/wikicrawl/init.py
+++ b/wikicrawl/init.py
@ -0,0 +1,12 @@
+# The __init__.py file signals to the python interpreter that the
+# wikicrawl directory is a package. A package is a special module that
+# contains other modules. Each file is a module (browser, cli, etc.)
+# and the "wikicrawl" package is a module that contains other modules.
+
+# The wikicrawl package, which is a module, exports the stuff exposed here.
+# We export wikicrawl.init() as a reference to wikicrawl.config.init() and
+# wikicrawl.main() as a reference to wikicrawl.main.main()
+
+from .config import init
+from .main import main
+
--- a/wikicrawl/assets/languages.py
+++ b/wikicrawl/assets/languages.py
@ -0,0 +1,39 @@
+
+LANGUAGES = {
+    'az': '',
+    'be': '',
+    'bg': '',
+    'ca': '',
+    'cs': '',
+    'da': '',
+    'de': 'Deutsch',
+    'el': '',
+    'en': 'English',
+    'es': 'Español',
+    'et': '',
+    'fi': '',
+    'fr': 'Français',
+    'hr': '',
+    'hu': '',
+    'hy': '',
+    'it': 'Italiano',
+    # 'ja': '日本語', -- no japanese in yandex
+    'lt': '',
+    'lv': '',
+    'mk': '',
+    'nl': '',
+    'no': '',
+    'pl': 'Polski',
+    'pt': 'Português',
+    'ro': '',
+    'ru': 'Русский',
+    'sk': '',
+    'sl': '',
+    'sq': '',
+    'sr': '',
+    'sv': '',
+    'tr': '',
+    'uk': '',
+    # 'zh': '中文' -- no chinese
+}
+
--- a/wikicrawl/browser.py
+++ b/wikicrawl/browser.py
@ -11,9 +11,9 @@

 import selenium
 import selenium.webdriver
-import logging

 from . import config
+from . import log

 # This function has a parameter (driver) that passes in a value. In this case,
 # this driver variable defaults to the string 'chrome'. The code can call
@ -25,9 +25,17 @@ def create_webdriver(driver='chrome'):
        return create_webdriver_chrome()
    elif driver == 'firefox':
        return create_webdriver_firefox()
+    elif driver == 'phantom':
+        return create_webdriver_phantom()
+    else:
+        # log.LOGGER is a LoggingLayer object: loggers are looked up by
+        # subscript (LOGGER['name']), the object itself is not callable.
+        log.LOGGER['browser'].error('unable to handle webdriver request: %s' % driver)
+        return

 def create_webdriver_firefox():
-    pass
+    profile = selenium.webdriver.FirefoxProfile()
+    profile.set_preference("general.useragent.override", config.obj.WEBDRIVER_USER_AGENT)
+    driver = selenium.webdriver.Firefox(profile)
+    return driver

 def create_webdriver_chrome():
    opt = selenium.webdriver.chrome.options.Options()
@ -35,3 +43,7 @@ def create_webdriver_chrome():
    driver = selenium.webdriver.Chrome(chrome_options = opt)
    return driver

+def create_webdriver_phantom():
+    driver = selenium.webdriver.PhantomJS()
+    return driver
+
--- a/wikicrawl/cli.py
+++ b/wikicrawl/cli.py
@ -1,37 +1,42 @@
-#!/usr/bin/env python
-# The command-line interface module creates a interface for
+# The command-line interface module creates an interface for
 # interacting with the python program (wikicrawl). This is an implementation
 # of the baker demo shown previously. The user can type in commands to
 # make the program do things.

 import baker
-import logging
-import readline # Needed for command history <up> and <down> arrows to work
 import sys

+
+if sys.platform == 'linux':
+    import readline # Needed for command history <up> and <down> arrows to work
+
+from . import log
 from . import model
 from . import config

 # Problem pages:
 # Decision (from politics)
 # Malaysia (goes inside parenthesis)
+# Soft-sediment_deformation_structures (doesn't find link)
+# Chemicals (loops at philosophical)

 commander = baker.Baker()

-def main():
-    user_interface = InteractiveInterface()
-    
-    if len(sys.argv) > 1: # Command line arguments were passed in
-                          # command-line when invoking python
-        user_interface.run(sys.argv)
-    else:
-        user_interface.start_command_loop()
-
 class InteractiveInterface:
    def __init__(self):
+        # Instantiate the variable self.model as an object
+        # of instance of the Model class defined in the model
+        # module. model.Model refers to the Model class in the
+        # model module and this line creates a new variable (self.model)
+        # which is a variable that is an instance of Model, i.e.
+        # it has the type Model and has Model.methods() available
+        # to it.
+        #
+        # self.model is a variable that is attached to the instance/object
+        # returned by this constructor that has the type InteractiveInterface.
        self.model = model.Model()

-    def run(self, args, main=True):
+    def run_command(self, args, main=True):
        """
        Runs the command-line interface for a single command.

@ -45,13 +50,13 @@ class InteractiveInterface:
            commander.run(argv=args, main=True, help_on_error=True,
                          instance=self)
        except baker.CommandError as ex:
-            logging.warn('incorrect user input: %s' % ex)
+            log.LOGGER['cli'].warn('incorrect user input: %s' % ex)
            commander.usage()
        except baker.TopHelp as ex:
            commander.usage()
        except Exception as ex:
-            logging.error('caught general exception!!')
-            print(type(ex), ex)
+            log.LOGGER['cli'].error('caught general exception!!')
+            # The first argument to error() must be the message string, so
+            # format the exception type and text into one string first.
+            log.LOGGER['cli'].error('%s %s' % (type(ex), ex))

    def start_command_loop(self):
        """
@ -76,12 +81,13 @@ class InteractiveInterface:
                                  # to NOT drop to a newline after printing
                                  # in the terminal. Instead, let the user
                                  # type their command on the same line as
-                                  # our printed '$ '.
+                                  # the printed '$ '.
            try:
                inp = input()
            except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
                break
-            # Note in arguments (mg):
+
+            # Note on "arguments" (mg):
            # Whenever a program is run in windows or *nix, the operating
            # system passes in the command string that was used to invoke
            # the program. You can append data in that command to configure
@ -91,16 +97,16 @@ class InteractiveInterface:
            # software but you can also pass in an argument. You can
            # alternatively run "python launcher.py <argument> <argument>..."
            # and the operating system will provide the <argument> values into
-            # the process that is running.
+            # the process that is running as variables.
            #
            # In a real world use case, many commands provide switches to
            # adjust what the program does. For example,
            #
            # The command:
-            #     find music -iname "*justin*bieber*"
+            #     find music -name "*justin*bieber*"
            # runs the "find" program and asks to find all the filenames that match the
            # pattern *justin*bieber* in the "music" directory.
-            # (music, -iname, "*justin*biever*") are argument parameters
+            # (music, -name, "*justin*bieber*") are argument parameters
            # that are passed into the program. The program is coded to
            # parse and interpret these values and execute differently based
            # on the values passed in. This is one way to pass in information
@ -123,14 +129,21 @@ class InteractiveInterface:
            # would be C:\Users\mguest\launcher.py.

            # What this method (start_command_loop()) does is provide a
-            # REPL which is a
+            # REPL shell which is a
            # read-eval-print-loop. It repeatedly asks the user for an
            # input (read), evaluates that input into an action (evaluate),
            # give the user some feedback (print), and start the process
-            # over again (loop). When you call "python", you are given a python
-            # process that gives you a REPL interactive shell. The way
+            # over again (loop). When you call just "python", you are loading a
+            # program that gives you a REPL interactive shell. The way
            # this wikicrawl app is implemented gives the user a REPL
            # that has commands to interact with wikipedia pages.
+
+            # Because we take in the input as a single string, we do
+            # a transformation to turn something like "play_multiple 5"
+            # into ["launcher.py", "play_multiple", "5"] which is how
+            # the arguments array would have been created if it were
+            # passed in the initial command instead of typed and interpreted
+            # as input as is done here.
            args = [sys.argv[0], ] + inp.split()

            # The user can at any point in the command pass the argument
@ -146,40 +159,42 @@ class InteractiveInterface:
            #     python launcher.py do_random_page --help
            # You will see the program spit out the heredoc below the
            # do_random_page method defined below.
-
            if '--help' in args:
                args.remove('--help')
                try:
                    print('command usage:')
                    commander.usage(args[1])
-                    return
                except Exception as ex:
                    print(type(ex), ex)
                continue

-            self.run(args, main=False)
+            self.run_command(args, main=False)

    @commander.command
-    def do_random_page(self):
+    def play_random_page(self):
        """
        Instructs the wikicrawl application to play the game on a random
        article.
        """
-        self.model.do_random_page()
+        self.model.play_random_page()

    @commander.command
-    def do_n_pages(self, n):
+    def play_multiple(self, n):
        """
        Plays the wikicrawl game <n>-times.
        """
        try:
            n = int(n)
        except ValueError as ex:
-            logging.warn('failed to process "%s" as a parameter' % n)
+            log.LOGGER['cli'].warn('failed to process "%s" as a parameter' % n)
            return False
        for i in range(n):
-            self.model.do_random_page()
+            self.model.play_random_page()

-if __name__ == '__main__':
-    main()
+    @commander.command
+    def exit(self):
+        """
+        Immediately exit the program.
+        """
+        sys.exit(0)

--- a/wikicrawl/config.py
+++ b/wikicrawl/config.py
--- a/wikicrawl/dal.py
+++ b/wikicrawl/dal.py
--- a/wikicrawl/log.py
+++ b/wikicrawl/log.py
@ -0,0 +1,33 @@
+# log module is a wrapper around third-party colorlog library
+# and provides an application-level interface to a logging system.
+
+import colorlog
+import logging
+
+from . import config
+
+# Default python log severity levels:
+# CRITICAL
+# ERROR
+# WARNING
+# INFO
+# DEBUG
+
+LOGGER = None
+
+class LoggingLayer:
+    """
+    Configures the python logging system once and hands out named
+    loggers through subscript access, e.g. LOGGER['cli'].info('...').
+    """
+    def __init__(self, config):
+        """
+        Args:
+            config: logging configuration dictionary that is passed
+        straight to logging.config.dictConfig().
+        """
+        # logging.config is a submodule and is not loaded by a plain
+        # "import logging", so import it explicitly here instead of
+        # relying on another module having imported it first.
+        import logging.config
+        self.loggers = {}
+        logging.config.dictConfig(config)
+
+    def __getitem__(self, k):
+        """
+        Returns the logger named k, creating and caching it in
+        self.loggers on first access.
+        """
+        logger = self.loggers.get(k)
+        if logger is None:
+            logger = logging.getLogger(k)
+            self.loggers[k] = logger
+        return logger
+
+def init_logging():
+    global LOGGER
+    LOGGER = LoggingLayer(config.obj.LOG_SETTINGS)
+
--- a/wikicrawl/main.py
+++ b/wikicrawl/main.py
@ -0,0 +1,19 @@
+#!/usr/bin/env python
+
+import sys
+
+from . import cli
+from . import util
+
+def main():
+    user_interface = cli.InteractiveInterface()
+
+    if len(sys.argv) > 1: # Command line arguments were passed in
+                          # command-line when invoking python
+        user_interface.run_command(sys.argv)
+    else:
+        user_interface.start_command_loop()
+
+if __name__ == '__main__':
+    main()
+
--- a/wikicrawl/model.py
+++ b/wikicrawl/model.py
@ -6,7 +6,6 @@
 # to implement the wiki crawl. This is a separation of concerns
 # and keeps the logic organized and separated.

-import logging
 import os
 import time

@ -15,10 +14,12 @@ from . import config
 from . import dal
 from . import log
 from . import pages
+from . import util

 class Model:
    def __init__(self):
        self._webdriver = None
+        self._translated_philosophy = None

    @property
    def webdriver(self):
@ -37,6 +38,16 @@ class Model:
            page_api.goto_landing_page()
        return self._webdriver

+    @property
+    def translated_philosophy(self):
+        # This translates 'philosophy' to the target language with only 1 api call.
+        if config.obj.PAGE_LANGUAGE == 'en':
+            self._translated_philosophy = 'philosophy'
+        elif not self._translated_philosophy:
+            text = util.translate_text('en', config.obj.PAGE_LANGUAGE, 'philosophy')
+            self._translated_philosophy = text
+        return self._translated_philosophy
+
    def open_browser(self):
        x = self.webdriver # Request the browser open immediately.
                           # Without this, the Model object will
@ -48,7 +59,7 @@ class Model:
                           # creates it and then it is re-used later
                           # in the application.

-    def do_random_page(self):
+    def play_random_page(self):
        """
        Select a random page and repeatedly click the first link until
        we reach the article on philosophy. Sometimes, the driver encounters
@ -93,25 +104,50 @@ class Model:
        # the action we're trying to invoke.
        page_api.goto_random_article()

-        # Article page
+        # Article pages
        pages_visited = []
+
+        # We just need translated_title to exist
+        translated_title = None
        while True:
            page_api = pages.ArticlePage(self.webdriver)

+            # Get the article title (and translate if necessary)
            title = page_api.get_title()
-            logging.debug('visited page: %s' % title)
+            if config.obj.PAGE_LANGUAGE != 'en':
+                translated_title = util.translate_text(config.obj.PAGE_LANGUAGE, 'en', title)
+                log.LOGGER['model'].info('visited page: %s (%s)' % (title, translated_title))
+            else:
+                log.LOGGER['model'].info('visited page: %s' % title)
+
+            # Check for page loops (have we already visited this page?)
            if title in pages_visited:
-                logging.info('encountered loop at page = %s' % title)
+                log.LOGGER['model'].info('encountered loop at page = %s' % title)
                break
-            if title == 'Philosophy':
-                logging.info('made it to philosophy in %s pages' % len(pages_visited))
+
+            # Check if we reached the article on philosophy
+            if self._is_article_on_philosophy(title, translated_title):
+                log.LOGGER['model'].info('made it to philosophy in %s pages' % len(pages_visited))
                pages_visited.append(title)
                break
+
+            # Store the result of what articles have been navigated
            pages_visited.append(title)

            rc = page_api.click_first_link()
            if not rc:
-                logging.warn('failure: unable to continue (perhaps no valid links?)')
+                log.LOGGER['model'].warn('failure: unable to continue (perhaps no valid links?)')
                break
            print()

+    def _is_article_on_philosophy(self, title, translated_title):
+        """
+        Checks both the original title and the translated (to english) title to
+        see if they seem to be the page on philosophy.
+        """
+        if title.lower() == self.translated_philosophy.lower():
+            return True
+        if translated_title and translated_title.lower() == 'philosophy':
+            return True
+        return False
+
--- a/wikicrawl/pages.py
+++ b/wikicrawl/pages.py
@ -1,22 +1,16 @@
-# Pages module defines classes for interacting with wikipedia pages.
+# pages module defines classes for interacting with wikipedia pages.
 # There are separate classes defined for each page with their own
 # defined methods for performing certain actions.

-import logging
 import re
 import selenium
 import time

 from . import browser
 from . import config
-
-def breakpoint():
-    """
-    If DO_BREAKPOINTS is switched on, this will pause program
-    execution and wait for the user to press enter to continue.
-    """
-    if config.obj.DO_BREAKPOINTS:
-        input('Breakpoint here. <Enter> to continue...')
+from . import log
+from . import util
+from .assets.languages import LANGUAGES

 class PageRootObject:
    """
@ -28,7 +22,7 @@ class PageRootObject:
    In here are some re-used methods to click links and highlight
    elements in the browser.
    """
-    def __init__(self, driver=None):
+    def __init__(self, driver):
        """
        Object constructor for initializing the instance of this
        class with internal variables needed.
@ -37,9 +31,6 @@ class PageRootObject:
            driver: Reference to the selenium webdriver object
        that is used to interface with the web browser.
        """
-        if not driver:
-            self.driver = browser.create_webdriver()
-        else:
        self.driver = driver

    def click(self, el): 
@ -49,11 +40,11 @@ class PageRootObject:

        Args:
            el: selenium element to be clicked. Typically an anchor
-        html link in the page.
+        html link in the webpage.
        """
        self.highlight(el, 'red')
        time.sleep(config.obj.PAGE_DELAY)
-        breakpoint()
+        util.breakpoint()
        el.click()

    def highlight(self, el, color):
@ -68,36 +59,55 @@ class PageRootObject:
            color: background color to highlight. Input can be one of
        'red', 'blue', or hex code such as '#ffffff'.
        """
-        # Note: The way hex codes work is there are 1 byte (2 hex characters)
+        # Note: The way hex codes work is there is 1 byte (2 hex characters)
        # for every color. #RRGGBB for (red, green, blue). This can be thought
        # of as an integer 0-255 for red, green, and blue in base-16 hexadecimal.
+        # For example, #ff0000 is bright red while #002f00 is light green
+        # and #ffff00 is full yellow.
        if color == 'red':
-            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
+            color = '#ff9292'
+            # js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
        elif color == 'blue':
-            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
+            color = '#9292ff'
+            # js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
        else:
+            # color = color
+            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
        js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
        self.driver.execute_script(js, el)

+# Note: This is the syntax for class inheritance. LandingPage is a new type of object that inherits
+# everything from the PageRootObject type. With this, you can call LandingPage.highlight() which
+# is a method defined in PageRootObject.
 class LandingPage(PageRootObject):
    """
    Interface for working with the wikipedia.org landing page. This page has links to
    select a language and go to the respective wikipedia root page.
    """

-    # Note: This is the LandingPage() object constructor. All it does right now is
+    # Note: This is the LandingPage() class constructor. The constructor is a method
+    # that is executed when a new object of this class is created. All it does right now is
    # reference the parent (PageRootObject) constructor method and call it. This
-    # calls PageRootObject.__init__(driver) which makes the web driver available
+    # calls PageRootObject.__init__(driver) which then makes the web driver available
    # in the object instance.
    def __init__(self, driver=None):
        super().__init__(driver)

    def goto_landing_page(self):
+        """
+        Navigates the browser to www.wikipedia.org
+        """
        self.driver.get(config.obj.PAGE_BASE_URL)

    def select_language(self, language):
-        link = self.driver.find_element_by_partial_link_text(language)
+        lang_text = LANGUAGES.get(language)
+        try:
+            link = self.driver.find_element_by_partial_link_text(lang_text)
            self.click(link)
+            return True
+        except selenium.common.exceptions.NoSuchElementException as ex:
+            logging.warn('failed to find language: %s as %s' % (language, lang_text))
+            return False

 class MainPage(PageRootObject):
    """
@ -121,7 +131,7 @@ class ArticlePage(PageRootObject):
    # These are used to locate html elements in the web browser. There are many
    # ways to locate elements but one of the best if available is locating by id. It's
    # not enforced but the html specification mandates that element id's are unique
-    # so if you can select by id in a semanticly correct web page, you can correctly
+    # so if you can select by id in a semantically correct web page, you can correctly
    # select unique elements with high confidence.
    elements = {
        'main-window-content-text-id': 'mw-content-text',
@ -132,10 +142,18 @@ class ArticlePage(PageRootObject):
        super().__init__(driver)

    def get_title(self):
+        """
+        Returns the article title.
+        """
        heading = self.driver.find_element_by_id(ArticlePage.elements['article-title'])
        return heading.text

    def click_first_link(self):
+        """
+        Attempts to click the first valid link in the article. Some work is
+        done to skip over certain links but the implementation breaks in some
+        edge cases. It's close but not perfect for every article text.
+        """
        return self._iterate_paragraphs()

    # Note: Here this method has it's name prepended with a single underscore.
@ -159,33 +177,51 @@ class ArticlePage(PageRootObject):
        main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
        paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
        for p in paragraphs:
+            # Return code indicates the success status of _parse_paragraph().
+            # In this case, an rc of True means that it was able to find a
+            # link and we stop going through paragraphs.
            rc = self._parse_paragraph(p)
            if rc:
                return True

    def _parse_paragraph(self, p):
+        """
+        Attempts to find a valid link in the paragraph element sent in.
+
+        Args:
+            p: Reference to selenium paragraph element. This is a paragraph
+        taken from the article.
+        """
        links = p.find_elements_by_xpath('.//a')
        if len(links) == 0:
            return False
        for link in links:
-            logging.debug('processing link: %s' % link.text)
+            log.LOGGER['pages'].debug('processing link: %s' % link.text)
            if not self._is_valid_link(p, link):
-                logging.debug('skipping link inside parenthesis: %s' % link.text)
+                log.LOGGER['pages'].debug('skipping link inside parenthesis: %s' % link.text)
                self.highlight(link, 'blue')
                continue
            self.highlight(link, 'red')
-            logging.info('selected link: %s' % link.text)
+            log.LOGGER['pages'].info('selected link: %s' % link.text)
            self.click(link)
            return True

    def _is_valid_link(self, p, el):
-        a = self._is_link_in_parenthesis(p, el)
-        b = self._is_link_a_footnote(el)
-        c = self._is_link_pronounciation(el)
-        d = self._is_link_audio(el)
-        if not a and not b and not c and not d:
-            return True
+        """
+        Returns True if the link is a valid target and False if the
+        implementation decides to skip it. You can see the reasons we
+        invalidate and skip a link here. If it's inside parentheses, is a
+        footnote, is a pronunciation guide or an audio link, we skip it.
+        """
+        if self._is_link_in_parenthesis(p, el):
            return False
+        if self._is_link_a_footnote(el):
+            return False
+        if self._is_link_pronounciation(el):
+            return False
+        if self._is_link_audio(el):
+            return False
+        return True

    def _is_link_in_parenthesis(self, p, el):
        """
@ -198,7 +234,7 @@ class ArticlePage(PageRootObject):
        # certain links and usually avoid links inside parenthetical
        # notes. Some edge cases are nested parenthesis, links with
        # non-english characters (which are displayed with a tree
-        # of elements in the html rather than a simply link). And
+        # of elements in the html rather than a simple link). And
        # sometimes, the link inside the parenthesis may be a valid
        # target. I've made it so that skipped links show up as blue
        # and determined-valid links highlight as red.
--- a/wikicrawl/util.py
+++ b/wikicrawl/util.py
@ -0,0 +1,23 @@
+# util module contains utility functions that can be common or shared
+# between the other modules.
+
+import yandex_translate
+
+from . import config
+
+def breakpoint():
+    """
+    If DO_BREAKPOINTS is switched on, this will pause program
+    execution and wait for the user to press enter to continue.
+    """
+    if config.obj.DO_BREAKPOINTS:
+        input('BREAKPOINT hit. <Enter> to continue...')
+
+def translate_text(source_language, target_language, text):
+    translate = yandex_translate.YandexTranslate(config.obj.YANDEX_API_KEY)
+    if not source_language:
+        source_language = translate.detect(text)
+    lang_direction = '%s-%s' % (source_language, target_language)
+    result = translate.translate(text, lang_direction)
+    return result['text'][0]
+