colored logging, multiple selenium drivers, multi language support

2025-04-19 09:59:21 -06:00 · 2017-08-25 18:09:46 -06:00 · 2017-08-25 18:09:46 -06:00 · 4aa965cfc8
commit 4aa965cfc8
parent 5d88690ded
17 changed files with 421 additions and 113 deletions
--- a/app/init.py
+++ b/app/init.py
@ -1,12 +0,0 @@
 # The __init__.py file signals to the python interpreter that the
 # app directory is a package. A package is a special module that
 # contains other modules. Each file is a module (browser, cli, etc.)
 # and the "app" package is a module that contains other modules.
 # The "app" module exports the stuff exposed here. We export
 # app.init() as a reference to app.config.init() and app.main
 # as a reference to app.cli.main
 from .config import init
 from .cli import main
--- a/app/log.py
+++ b/app/log.py
@ -1,7 +0,0 @@
 import logging
 from . import config
 def init_logging():
    logging.basicConfig(level=config.obj.LOG_LEVEL)
--- a/launcher.py
+++ b/launcher.py
@ -1,9 +1,9 @@
-import app
+import wikicrawl
 import settings
 # Inject the settings.DefaultSettings object into the
 # app and start running the program.
-app.init(settings.DefaultSettings)
+wikicrawl.init(settings.DefaultSettings)
-app.main()
+wikicrawl.main()
 input('<enter> to exit')
--- a/settings.py
+++ b/settings.py
@ -4,28 +4,107 @@
 # not hard-coded into the application. For example, some users may want
 # to run this program in English while others may want to run in Spanish.
 # The way this works is we specify those variables external from the
-# application (here) and pass them into the application (app.config module).
+# application (here) and pass them into the application (wikicrawl.config module).
-# The application then references app.config.obj to access the variables
+# The application then references wikicrawl.config.obj to access the variables
 # passed in from here.
 import colorlog
 import logging
 import logging.config
 class DefaultSettings:
    # Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!!
    LOG_FILENAME = '/tmp/wikicrawl.log'
    SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
    # Application Parameters
-    LOG_LEVEL = logging.INFO
+    DO_BREAKPOINTS = False
    DO_BREAKPOINTS = True
    PAGE_DELAY = 0
    # Web Driver Parameters
    WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'
-    WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox'
+
    # Requested browser and webdriver dependencies are required for this to work.
    # This means you need to have installed on your system:
    # Chrome + WebDriver for Chrome
    # Firefox + geckodriver for Firefox
    # phantomjs for phantom
    WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox', 'phantom'
    # Wikipedia Parameters
    PAGE_BASE_URL = 'https://www.wikipedia.org/'
    PAGE_LANGUAGE = 'English'
    # PAGE_LANGUAGE = 'Español'
    # PAGE_LANGUAGE = 'Русский'
-    # Data Layer Parameters
+    # Supported Languages so far:
-    SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
+    # German, English, Spanish, French, Italian, Portuguese, Polish, Russian
    # 'de', 'en', 'es', 'fr', 'it', 'pl', 'pt', 'ru'
    PAGE_LANGUAGE = 'en'
    # API Keys
    YANDEX_API_KEY = 'trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3'
    # Logging Parameters
    LOG_SETTINGS = {
        'version': 1, # version is always 1
        'formatters': {
            'colored': {
                '()': 'colorlog.ColoredFormatter',
                'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(blue)s%(message)s'
            },
            'basic': {
                '()': 'logging.Formatter',
                'format': '%(levelname)s:%(name)s:%(asctime)s:%(message)s'
            }
        },
        'handlers': {
            'stderr': {
                'class': 'logging.StreamHandler',
                # The handler level will override the logger level if higher.
                # That is, if the logger level is set to pass through DEBUG
                # and higher and the handler is set to only pass through WARNING
                # and higher, DEBUG messages will not pass through to this loggers
                # handler. You can configure multiple handlers for any logger so
                # for example you could log WARNINGS and ERRORS to a file but
                # not save all the DEBUG messages.
                'level': logging.DEBUG,
                'formatter': 'colored'
            },
            'file': {
                'class': 'logging.handlers.RotatingFileHandler',
                'level': logging.INFO,
                'formatter': 'basic',
                'filename': LOG_FILENAME,
                'maxBytes': 32768,
                'backupCount': 3
            }
        },
        'loggers': {
            # Root Logger
            '': {
                'level': logging.DEBUG,
                'handlers': ['file'],
            },
            'main': {
                'level': logging.DEBUG,
                'handlers': ['stderr'],
                'propagate': False
            },
            'model': {
                'level': logging.DEBUG,
                'handlers': ['stderr'],
                'propagate': True
            },
            'cli': {
                'level': logging.DEBUG,
                'handlers': ['stderr'],
                'propagate': False
            },
            'pages': {
                'level': logging.INFO,
                'handlers': ['stderr'],
                'propagate': False
            }
        }
    }
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,23 @@
 #!/usr/bin/env python
 # setup.py is the install script for this application. This will download
 # required third-party dependencies and package the app. You can also
 # install the application system-wide.
 from setuptools import setup
 __project__ = 'wikicrawl'
 # If you're looking for a versioning scheme, one revered pattern
 # can be read about at http://semver.org
 __version__ = '0.9.0'
 # Every third-party package the application imports has to be listed in
 # install_requires, otherwise a fresh install fails at import time:
 #   baker            -> imported by wikicrawl/cli.py
 #   colorlog         -> imported by wikicrawl/log.py and settings.py
 #   selenium         -> imported by wikicrawl/browser.py and wikicrawl/pages.py
 #   yandex.translate -> imported (as yandex_translate) by wikicrawl/util.py
 # wikicrawl.assets is listed as a package because wikicrawl/pages.py imports
 # wikicrawl.assets.languages; without it the languages table is not installed.
 setup(name = __project__,
       version = __version__,
       description = '',
       author = '',
       author_email = '',
       url = '',
       install_requires = ('baker',
                           'colorlog',
                           'selenium',
                           'yandex.translate',
                           ),
       packages = ('wikicrawl',
                   'wikicrawl.assets',
                   ))
--- a/wikicrawl/.dal.py.swp
+++ b/wikicrawl/.dal.py.swp
--- a/wikicrawl/init.py
+++ b/wikicrawl/init.py
@ -0,0 +1,12 @@
 # The __init__.py file signals to the python interpreter that the
 # wikicrawl directory is a package. A package is a special module that
 # contains other modules. Each file is a module (browser, cli, etc.)
 # and the "wikicrawl" package is a module that contains other modules.
 # The wikicrawl package, which is a module, exports the stuff exposed here.
 # We export wikicrawl.init as a reference to wikicrawl.config.init and
 # wikicrawl.main as a reference to wikicrawl.main.main
 from .config import init
 from .main import main
--- a/wikicrawl/assets/languages.py
+++ b/wikicrawl/assets/languages.py
@ -0,0 +1,39 @@
 # Maps two-letter language codes (the same form as settings.PAGE_LANGUAGE,
 # e.g. 'en') to the name of that language written in the language itself.
 # LandingPage.select_language() looks a code up here and uses the value as
 # the partial link text to click. Codes mapped to '' have no display name
 # filled in yet.
 LANGUAGES = {
    'az': '',
    'be': '',
    'bg': '',
    'ca': '',
    'cs': '',
    'da': '',
    'de': 'Deutsch',
    'el': '',
    'en': 'English',
    'es': 'Español',
    'et': '',
    'fi': '',
    'fr': 'Français',
    'hr': '',
    'hu': '',
    'hy': '',
    'it': 'Italiano',
    # 'ja': '日本語', -- no japanese in yandex
    'lt': '',
    'lv': '',
    'mk': '',
    'nl': '',
    'no': '',
    'pl': 'Polski',
    'pt': 'Português',
    'ro': '',
    'ru': 'Русский',
    'sk': '',
    'sl': '',
    'sq': '',
    'sr': '',
    'sv': '',
    'tr': '',
    'uk': '',
    # 'zh': '中文' -- no chinese
 }
--- a/wikicrawl/browser.py
+++ b/wikicrawl/browser.py
@ -11,9 +11,9 @@
 import selenium
 import selenium.webdriver
 import logging
 from . import config
 from . import log
 # This function has a parameter (driver) that passes in a value. In this case,
 # this driver variable defaults to the string 'chrome'. The code can call
@ -25,9 +25,17 @@ def create_webdriver(driver='chrome'):
        return create_webdriver_chrome()
    elif driver == 'firefox':
        return create_webdriver_firefox()
    elif driver == 'phantom':
        return create_webdriver_phantom()
    else:
        # log.LOGGER is a LoggingLayer, which hands out loggers through
        # subscript access (__getitem__); it is not callable.
        log.LOGGER['browser'].error('unable to handle webdriver request: %s' % driver)
        return
 def create_webdriver_firefox():
-    pass
+    profile = selenium.webdriver.FirefoxProfile()
    profile.set_preference("general.useragent.override", config.obj.WEBDRIVER_USER_AGENT)
    driver = selenium.webdriver.Firefox(profile)
    return driver
 def create_webdriver_chrome():
    opt = selenium.webdriver.chrome.options.Options()
@ -35,3 +43,7 @@ def create_webdriver_chrome():
    driver = selenium.webdriver.Chrome(chrome_options = opt)
    return driver
 def create_webdriver_phantom():
    """
    Creates a selenium webdriver backed by PhantomJS and returns it.
    """
    return selenium.webdriver.PhantomJS()
--- a/wikicrawl/cli.py
+++ b/wikicrawl/cli.py
@ -1,37 +1,42 @@
-#!/usr/bin/env python
+# The command-line interface module creates an interface for
 # The command-line interface module creates a interface for
 # interacting with the python program (wikicrawl). This is an implementation
 # of the baker demo shown previously. The user can type in commands to
 # make the program do things.
 import baker
 import logging
 import readline # Needed for command history <up> and <down> arrows to work
 import sys
 if sys.platform == 'linux':
    import readline # Needed for command history <up> and <down> arrows to work
 from . import log
 from . import model
 from . import config
 # Problem pages:
 # Decision (from politics)
 # Malaysia (goes inside parenthesis)
 # Soft-sediment_deformation_structures (doesn't find link)
 # Chemicals (loops at philosophical)
 commander = baker.Baker()
 def main():
    """
    Entry point for the command-line interface.

    When arguments were given on the command line they are executed as a
    single command; otherwise the interactive command loop is started.
    """
    user_interface = InteractiveInterface()
    if len(sys.argv) > 1: # Command line arguments were passed in
                          # command-line when invoking python
        # InteractiveInterface.run() was renamed to run_command(), so the
        # old name would raise AttributeError here.
        user_interface.run_command(sys.argv)
    else:
        user_interface.start_command_loop()
 class InteractiveInterface:
    def __init__(self):
        # Instantiate the variable self.model as an object
        # of instance of the Model class defined in the model
        # module. model.Model refers to the Model class in the
        # model module and this line creates a new variable (self.model)
        # which is a variable that is an instance of Model, i.e.
        # it has the type Model and has Model.methods() available
        # to it.
        #
        # self.model is a variable that is attached to the instance/object
        # returned by this constructor that has the type InteractiveInterface.
        self.model = model.Model()
-    def run(self, args, main=True):
+    def run_command(self, args, main=True):
        """
        Runs the command-line interface for a single command.
@ -45,13 +50,13 @@ class InteractiveInterface:
            commander.run(argv=args, main=True, help_on_error=True,
                          instance=self)
        except baker.CommandError as ex:
-            logging.warn('incorrect user input: %s' % ex)
+            log.LOGGER['cli'].warn('incorrect user input: %s' % ex)
            commander.usage()
        except baker.TopHelp as ex:
            commander.usage()
        except Exception as ex:
-            logging.error('caught general exception!!')
+            log.LOGGER['cli'].error('caught general exception!!')
-            print(type(ex), ex)
+            log.LOGGER['cli'].error(type(ex), ex)
    def start_command_loop(self):
        """
@ -76,12 +81,13 @@ class InteractiveInterface:
                                  # to NOT drop to a newline after printing
                                  # in the terminal. Instead, let the user
                                  # type their command on the same line as
-                                  # our printed '$ '.
+                                  # the printed '$ '.
            try:
                inp = input()
            except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
                break
-            # Note in arguments (mg):
+
            # Note on "arguments" (mg):
            # Whenever a program is run in windows or *nix, the operating
            # system passes in the command string that was used to invoke
            # the program. You can append data in that command to configure
@ -91,16 +97,16 @@ class InteractiveInterface:
            # software but you can also pass in an argument. You can
            # alternatively run "python launcher.py <argument> <argument>..."
            # and the operating system will provide the <argument> values into
-            # the process that is running.
+            # the process that is running as variables.
            #
            # In a real world use case, many commands provide switches to
            # adjust what the program does. For example,
            #
            # The command:
-            #     find music -iname "*justin*bieber*"
+            #     find music -name "*justin*bieber*"
            # runs the "find" program and asks to find all the filenames that match the
            # pattern *justin*bieber* in the "music" directory.
-            # (music, -iname, "*justin*biever*") are argument parameters
+            # (music, -name, "*justin*biever*") are argument parameters
            # that are passed into the program. The program is coded to
            # parse and interpret these values and execute differently based
            # on the values passed in. This is one way to pass in information
@ -123,14 +129,21 @@ class InteractiveInterface:
            # would be C:\Users\mguest\launcher.py.
            # What this method (start_command_loop()) does is provide a
-            # REPL which is a
+            # REPL shell which is a
            # read-eval-print-loop. It repeatedly asks the user for an
            # input (read), evaluates that input into an action (evaluate),
            # give the user some feedback (print), and start the process
-            # over again (loop). When you call "python", you are given a python
+            # over again (loop). When you call just "python", you are loading a
-            # process that gives you a REPL interactive shell. The way
+            # program that gives you a REPL interactive shell. The way
            # this wikicrawl app is implemented gives the user a REPL
            # that has commands to interact with wikipedia pages.
            # Because we take in the input as a single string, we do
            # a transformation to turn something like "do_random_page 5"
            # into ["launcher.py", "do_random_page", "5"] which is how
            # the arguments array would have been created if it were
            # passed in the initial command instead of typed and interpreted
            # as input as is done here.
            args = [sys.argv[0], ] + inp.split()
            # The user can at any point in the command pass the argument
@ -146,40 +159,42 @@ class InteractiveInterface:
            #     python launcher.py do_random_page --help
            # You will see the program spit out the heredoc below the
            # do_random_page method defined below.
            if '--help' in args:
                args.remove('--help')
                try:
                    print('command usage:')
                    commander.usage(args[1])
                    return
                except Exception as ex:
                    print(type(ex), ex)
                continue
-            self.run(args, main=False)
+            self.run_command(args, main=False)
    @commander.command
-    def do_random_page(self):
+    def play_random_page(self):
        """
        Instructs the wikicrawl application to play the game on a random
        article.
        """
-        self.model.do_random_page()
+        self.model.play_random_page()
    @commander.command
-    def do_n_pages(self, n):
+    def play_multiple(self, n):
        """
        Plays the wikicrawl game <n>-times.
        """
        try:
            n = int(n)
        except ValueError as ex:
-            logging.warn('failed to process "%s" as a parameter' % n)
+            log.LOGGER['cli'].warn('failed to process "%s" as a parameter' % n)
            return False
        for i in range(n):
-            self.model.do_random_page()
+            self.model.play_random_page()
-if __name__ == '__main__':
+    @commander.command
-    main()
+    def exit(self):
        """
        Immediately exit the program.
        """
        sys.exit(0)
--- a/wikicrawl/config.py
+++ b/wikicrawl/config.py
--- a/wikicrawl/dal.py
+++ b/wikicrawl/dal.py
--- a/wikicrawl/log.py
+++ b/wikicrawl/log.py
@ -0,0 +1,33 @@
 # log module is a wrapper around third-party colorlog library
 # and provides an application-level interface to a logging system.
 import colorlog
 import logging
 from . import config
 # Default python log severity levels:
 # CRITICAL
 # ERROR
 # WARNING
 # INFO
 # DEBUG
 LOGGER = None
 class LoggingLayer:
    """
    Application-level registry of named loggers.

    Applies a logging configuration dictionary when constructed and then
    hands out loggers by name through subscript access, for example
    LOGGER['cli'].
    """
    def __init__(self, config):
        """
        Args:
            config: logging configuration dictionary in the schema accepted
        by logging.config.dictConfig(). Inside this method the parameter
        shadows the imported config module.
        """
        # "import logging" on its own does not load the logging.config
        # submodule, so import it here instead of relying on some other
        # module having imported it first.
        import logging.config
        self.loggers = {}
        logging.config.dictConfig(config)
    def __getitem__(self, k):
        """
        Returns the logger named k, creating and remembering it on first use.
        """
        logger = self.loggers.get(k)
        if logger is None:
            logger = logging.getLogger(k)
            self.loggers[k] = logger
        return logger
 def init_logging():
    """
    Builds the module-level LOGGER from the injected settings.

    Reads config.obj.LOG_SETTINGS and replaces the global LOGGER (which is
    None until this runs) with a LoggingLayer built from those settings.
    """
    global LOGGER
    LOGGER = LoggingLayer(config.obj.LOG_SETTINGS)
--- a/wikicrawl/main.py
+++ b/wikicrawl/main.py
@ -0,0 +1,19 @@
 #!/usr/bin/env python
 import sys
 from . import cli
 from . import util
 def main():
    """
    Program entry point: run one command or start the interactive shell.

    With no extra command-line arguments the interactive command loop is
    started; otherwise the full argument list is run as a single command.
    """
    user_interface = cli.InteractiveInterface()
    no_arguments_given = len(sys.argv) <= 1
    if no_arguments_given:
        user_interface.start_command_loop()
        return
    user_interface.run_command(sys.argv)
 if __name__ == '__main__':
    main()
--- a/wikicrawl/model.py
+++ b/wikicrawl/model.py
@ -6,7 +6,6 @@
 # to implement the wiki crawl. This is a separation of concerns
 # and keeps the logic organized and separated.
 import logging
 import os
 import time
@ -15,10 +14,12 @@ from . import config
 from . import dal
 from . import log
 from . import pages
 from . import util
 class Model:
    def __init__(self):
        self._webdriver = None
        self._translated_philosophy = None
    @property
    def webdriver(self):
@ -37,6 +38,16 @@ class Model:
            page_api.goto_landing_page()
        return self._webdriver
    @property
    def translated_philosophy(self):
        """
        The word 'philosophy' in the configured page language.

        For English the literal word is used. For any other language the
        translation is requested once and the stored result is reused on
        later accesses.
        """
        target_language = config.obj.PAGE_LANGUAGE
        if target_language == 'en':
            self._translated_philosophy = 'philosophy'
            return self._translated_philosophy
        if not self._translated_philosophy:
            self._translated_philosophy = util.translate_text('en', target_language, 'philosophy')
        return self._translated_philosophy
    def open_browser(self):
        x = self.webdriver # Request the browser open immediately.
                           # Without this, the Model object will
@ -48,7 +59,7 @@ class Model:
                           # creates it and then it is re-used later
                           # in the application.
-    def do_random_page(self):
+    def play_random_page(self):
        """
        Select a random page and repeatedly click the first link until
        we reach the article on philosophy. Sometimes, the driver encounters
@ -93,25 +104,50 @@ class Model:
        # the action we're trying to invoke.
        page_api.goto_random_article()
-        # Article page
+        # Article pages
        pages_visited = []
        # We just need translated_title to exist
        translated_title = None
        while True:
            page_api = pages.ArticlePage(self.webdriver)
            # Get the article title (and translate if necessary)
            title = page_api.get_title()
-            logging.debug('visited page: %s' % title)
+            if config.obj.PAGE_LANGUAGE != 'en':
                translated_title = util.translate_text(config.obj.PAGE_LANGUAGE, 'en', title)
                log.LOGGER['model'].info('visited page: %s (%s)' % (title, translated_title))
            else:
                log.LOGGER['model'].info('visited page: %s' % title)
            # Check for page loops (have we already visited this page?)
            if title in pages_visited:
-                logging.info('encountered loop at page = %s' % title)
+                log.LOGGER['model'].info('encountered loop at page = %s' % title)
                break
-            if title == 'Philosophy':
+
-                logging.info('made it to philosophy in %s pages' % len(pages_visited))
+            # Check if we reached the article on philosophy
            if self._is_article_on_philosophy(title, translated_title):
                log.LOGGER['model'].info('made it to philosophy in %s pages' % len(pages_visited))
                pages_visited.append(title)
                break
            # Store the result of what articles have been navigated
            pages_visited.append(title)
            rc = page_api.click_first_link()
            if not rc:
-                logging.warn('failure: unable to continue (perhaps no valid links?)')
+                log.LOGGER['model'].warn('failure: unable to continue (perhaps no valid links?)')
                break
            print()
    def _is_article_on_philosophy(self, title, translated_title):
        """
        Decides whether an article looks like the page on philosophy.

        The comparison ignores case. The article matches when its own title
        equals the word for philosophy in the page language, or when a
        translated (to english) title was supplied and equals 'philosophy'.
        """
        return (
            title.lower() == self.translated_philosophy.lower()
            or (bool(translated_title) and translated_title.lower() == 'philosophy')
        )
--- a/wikicrawl/pages.py
+++ b/wikicrawl/pages.py
@ -1,22 +1,16 @@
-# Pages module defines classes for interacting with wikipedia pages.
+# pages module defines classes for interacting with wikipedia pages.
 # There are separate classes defined for each page with their own
 # defined methods for performing certain actions.
 import logging
 import re
 import selenium
 import time
 from . import browser
 from . import config
-
+from . import log
-def breakpoint():
+from . import util
-    """
+from .assets.languages import LANGUAGES
    If DO_BREAKPOINTS is switched on, this will pause program
    execution and wait for the user to press enter to continue.
    """
    if config.obj.DO_BREAKPOINTS:
        input('Breakpoint here. <Enter> to continue...')
 class PageRootObject:
    """
@ -28,7 +22,7 @@ class PageRootObject:
    In here are some re-used methods to click links and highlight
    elements in the browser.
    """
-    def __init__(self, driver=None):
+    def __init__(self, driver):
        """
        Object constructor for initializing the instance of this
        class with internal variables needed.
@ -37,9 +31,6 @@ class PageRootObject:
            driver: Reference to the selenium webdriver object
        that is used to interface with the web browser.
        """
        if not driver:
            self.driver = browser.create_webdriver()
        else:
        self.driver = driver
    def click(self, el): 
@ -49,11 +40,11 @@ class PageRootObject:
        Args:
            el: selenium element to be clicked. Typically an anchor
-        html link in the page.
+        html link in the webpage.
        """
        self.highlight(el, 'red')
        time.sleep(config.obj.PAGE_DELAY)
-        breakpoint()
+        util.breakpoint()
        el.click()
    def highlight(self, el, color):
@ -68,36 +59,55 @@ class PageRootObject:
            color: background color to highlight. Input can be one of
        'red', 'blue', or hex code such as '#ffffff'.
        """
-        # Note: The way hex codes work is there are 1 byte (2 hex characters)
+        # Note: The way hex codes work is there is 1 byte (2 hex characters)
        # for every color. #RRGGBB for (red, green, blue). This can be thought
        # of as an integer 0-255 for red, green, and blue in base-16 hexadecimal.
        # For example, #ff0000 is bright red while #002f00 is dark green
        # and #ffff00 is full yellow.
        if color == 'red':
-            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
+            color = '#ff9292'
            # js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
        elif color == 'blue':
-            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
+            color = '#9292ff'
            # js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
        else:
            # color = color
            js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
        js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
        self.driver.execute_script(js, el)
 # Note: This is the syntax for class inheritance. LandingPage is a new type of object that inherits
 # everything from the PageRootObject type. With this, you can call LandingPage.highlight() which
 # is a method defined in PageRootObject.
 class LandingPage(PageRootObject):
    """
    Interface for working with the wikipedia.org landing page. This page has links to
    select a language and go to the respective wikipedia root page.
    """
-    # Note: This is the LandingPage() object constructor. All it does right now is
+    # Note: This is the LandingPage() class constructor. The constructor is a method
    # that is executed when a new object of this class is created. All it does right now is
    # reference the parent (PageRootObject) constructor method and call it. This
-    # calls PageRootObject.__init__(driver) which makes the web driver available
+    # calls PageRootObject.__init__(driver) which then makes the web driver available
    # in the object instance.
    def __init__(self, driver=None):
        super().__init__(driver)
    def goto_landing_page(self):
        """
        Navigates the browser to www.wikipedia.org
        """
        self.driver.get(config.obj.PAGE_BASE_URL)
    def select_language(self, language):
-        link = self.driver.find_element_by_partial_link_text(language)
+        lang_text = LANGUAGES.get(language)
        try:
            link = self.driver.find_element_by_partial_link_text(lang_text)
            self.click(link)
            return True
        except selenium.common.exceptions.NoSuchElementException as ex:
            logging.warn('failed to find language: %s as %s' % (language, lang_text))
            return False
 class MainPage(PageRootObject):
    """
@ -121,7 +131,7 @@ class ArticlePage(PageRootObject):
    # These are used to locate html elements in the web browser. There are many
    # ways to locate elements but one of the best if available is locating by id. It's
    # not enforced but the html specification mandates that element id's are unique
-    # so if you can select by id in a semanticly correct web page, you can correctly
+    # so if you can select by id in a semantically correct web page, you can correctly
    # select unique elements with high confidence.
    elements = {
        'main-window-content-text-id': 'mw-content-text',
@ -132,10 +142,18 @@ class ArticlePage(PageRootObject):
        super().__init__(driver)
    def get_title(self):
        """
        Looks up the article heading element by id and returns its text.
        """
        title_element_id = ArticlePage.elements['article-title']
        return self.driver.find_element_by_id(title_element_id).text
    def click_first_link(self):
        """
        Attempts to click the first valid link in the article. Some work is
        done to skip over certain links but the implementation breaks in some
        edge cases. It's close but not perfect for every article text.

        Returns:
            Whatever _iterate_paragraphs() returns: True when a link was
        found and clicked, otherwise a falsy value (the helper falls off
        the end of its paragraph loop without an explicit return).
        """
        return self._iterate_paragraphs()
    # Note: Here this method has its name prefixed with a single underscore.
@ -159,33 +177,51 @@ class ArticlePage(PageRootObject):
        main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
        paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
        for p in paragraphs:
            # Return code indicates the success status of _parse_paragraph().
            # In this case, an rc of True means that it was able to find a
            # link and we stop going through paragraphs.
            rc = self._parse_paragraph(p)
            if rc:
                return True
    def _parse_paragraph(self, p):
        """
        Attempts to find a valid link in the paragraph element sent in.
        Args:
            p: Reference to selenium paragraph element. This is a paragraph
        taken from the article.
        """
        links = p.find_elements_by_xpath('.//a')
        if len(links) == 0:
            return False
        for link in links:
-            logging.debug('processing link: %s' % link.text)
+            log.LOGGER['pages'].debug('processing link: %s' % link.text)
            if not self._is_valid_link(p, link):
-                logging.debug('skipping link inside parenthesis: %s' % link.text)
+                log.LOGGER['pages'].debug('skipping link inside parenthesis: %s' % link.text)
                self.highlight(link, 'blue')
                continue
            self.highlight(link, 'red')
-            logging.info('selected link: %s' % link.text)
+            log.LOGGER['pages'].info('selected link: %s' % link.text)
            self.click(link)
            return True
    def _is_valid_link(self, p, el):
-        a = self._is_link_in_parenthesis(p, el)
+        """
-        b = self._is_link_a_footnote(el)
+        Returns if the implementation decides to skip this link. You can
-        c = self._is_link_pronounciation(el)
+        see the reasons we invalidate and skip a link here. If it's
-        d = self._is_link_audio(el)
+        inside parenthesis, is a footnote, is a pronounciation guide or
-        if not a and not b and not c and not d:
+        audio link, we choose to skip it.
-            return True
+        """
        if self._is_link_in_parenthesis(p, el):
            return False
        if self._is_link_a_footnote(el):
            return False
        if self._is_link_pronounciation(el):
            return False
        if self._is_link_audio(el):
            return False
        return True
    def _is_link_in_parenthesis(self, p, el):
        """
@ -198,7 +234,7 @@ class ArticlePage(PageRootObject):
        # certain links and usually avoid links inside parenthetical
        # notes. Some edge cases are nested parenthesis, links with
        # non-english characters (which are displayed with a tree
-        # of elements in the html rather than a simply link). And
+        # of elements in the html rather than a simple link). And
        # sometimes, the link inside the parenthesis may be a valid
        # target. I've made it so that skipped links show up as blue
        # and determined-valid links highlight as red.
--- a/wikicrawl/util.py
+++ b/wikicrawl/util.py
@ -0,0 +1,23 @@
 # util module contains utility functions that can be common or shared
 # between the other modules.
 import yandex_translate
 from . import config
 def breakpoint():
    """
    Pauses the program until the user presses enter, but only when the
    DO_BREAKPOINTS setting is switched on. Otherwise does nothing.
    """
    if not config.obj.DO_BREAKPOINTS:
        return
    input('BREAKPOINT hit. <Enter> to continue...')
 def translate_text(source_language, target_language, text):
    """
    Translates text from one language to another with the Yandex client.

    Args:
        source_language: language code the text is written in. When this
    is empty or None the client is asked to detect it from the text.
        target_language: language code to translate into.
        text: the text to translate.

    Returns:
        The first entry of the 'text' list in the translation result.
    """
    client = yandex_translate.YandexTranslate(config.obj.YANDEX_API_KEY)
    origin = source_language or client.detect(text)
    response = client.translate(text, '%s-%s' % (origin, target_language))
    return response['text'][0]