colored logging, multiple selenium drivers, multi language support

This commit is contained in:
Mathew Guest 2017-08-25 18:09:46 -06:00
parent 5d88690ded
commit 4aa965cfc8
17 changed files with 421 additions and 113 deletions

@ -1,12 +0,0 @@
# The __init__.py file signals to the python interpreter that the
# app directory is a package. A package is a special module that
# contains other modules. Each file is a module (browser, cli, etc.)
# and the "app" package is a module that contains other modules.
# The "app" module exports the stuff exposed here. We export
# app.init() as a reference to app.config.init() and app.main
# as a reference to app.cli.main
from .config import init
from .cli import main

@ -1,7 +0,0 @@
import logging
from . import config
def init_logging():
logging.basicConfig(level=config.obj.LOG_LEVEL)

@ -1,9 +1,9 @@
import app import wikicrawl
import settings import settings
# Inject the settings.DefaultSettings object into the # Inject the settings.DefaultSettings object into the
# app and start running the program. # app and start running the program.
app.init(settings.DefaultSettings) wikicrawl.init(settings.DefaultSettings)
app.main() wikicrawl.main()
input('<enter> to exit') input('<enter> to exit')

@ -4,28 +4,107 @@
# not hard-coded into the application. For example, some users may want # not hard-coded into the application. For example, some users may want
# to run this program in English while others may want to run in Spanish. # to run this program in English while others may want to run in Spanish.
# The way this works is we specify those variables external from the # The way this works is we specify those variables external from the
# application (here) and pass them into the application (app.config module). # application (here) and pass them into the application (wikicrawl.config module).
# The application then references app.config.obj to access the variables # The application then references wikicrawl.config.obj to access the variables
# passed in from here. # passed in from here.
import colorlog
import logging import logging
import logging.config
class DefaultSettings: class DefaultSettings:
# Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!!
LOG_FILENAME = '/tmp/wikicrawl.log'
SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
# Application Parameters # Application Parameters
LOG_LEVEL = logging.INFO DO_BREAKPOINTS = False
DO_BREAKPOINTS = True
PAGE_DELAY = 0 PAGE_DELAY = 0
# Web Driver Parameters # Web Driver Parameters
WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)' WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'
WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox'
# Requested browser and webdriver dependencies are required for this to work.
# This means you need to have installed on your system:
# Chrome + WebDriver for Chrome
# Firefox + geckodriver for Firefox
# phantomjs for phantom
WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox', 'phantom'
# Wikipedia Parameters # Wikipedia Parameters
PAGE_BASE_URL = 'https://www.wikipedia.org/' PAGE_BASE_URL = 'https://www.wikipedia.org/'
PAGE_LANGUAGE = 'English'
# PAGE_LANGUAGE = 'Español'
# PAGE_LANGUAGE = 'Русский'
# Data Layer Parameters # Supported Languages so far:
SQLITE_DBFILE = '/home/mathew/.wikicrawler.db' # German, English, Spanish, French, Italian, Portuguese, Polish, Russian
# 'de', 'en', 'es', 'fr', 'it', 'pl', 'pt', 'ru'
PAGE_LANGUAGE = 'en'
# API Keys
YANDEX_API_KEY = 'trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3'
# Logging Parameters
LOG_SETTINGS = {
'version': 1, # version is always 1
'formatters': {
'colored': {
'()': 'colorlog.ColoredFormatter',
'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(blue)s%(message)s'
},
'basic': {
'()': 'logging.Formatter',
'format': '%(levelname)s:%(name)s:%(asctime)s:%(message)s'
}
},
'handlers': {
'stderr': {
'class': 'logging.StreamHandler',
# The handler level will override the logger level if higher.
# That is, if the logger level is set to pass through DEBUG
# and higher and the handler is set to only pass through WARNING
# and higher, DEBUG messages will not pass through to this logger's
# handler. You can configure multiple handlers for any logger so
# for example you could log WARNINGS and ERRORS to a file but
# not save all the DEBUG messages.
'level': logging.DEBUG,
'formatter': 'colored'
},
'file': {
'class': 'logging.handlers.RotatingFileHandler',
'level': logging.INFO,
'formatter': 'basic',
'filename': LOG_FILENAME,
'maxBytes': 32768,
'backupCount': 3
}
},
'loggers': {
# Root Logger
'': {
'level': logging.DEBUG,
'handlers': ['file'],
},
'main': {
'level': logging.DEBUG,
'handlers': ['stderr'],
'propagate': False
},
'model': {
'level': logging.DEBUG,
'handlers': ['stderr'],
'propagate': True
},
'cli': {
'level': logging.DEBUG,
'handlers': ['stderr'],
'propagate': False
},
'pages': {
'level': logging.INFO,
'handlers': ['stderr'],
'propagate': False
}
}
}

@ -0,0 +1,23 @@
#!/usr/bin/env python
# setup.py is the install script for this application. This will download
# required third-party dependencies and package the app. You can also
# install the application system-wide.
from setuptools import setup
__project__ = 'wikicrawl'

# If you're looking for a versioning scheme, one revered pattern
# can be read about at http://semver.org
__version__ = '0.9.0'

# Every third-party distribution the package imports at runtime has to be
# listed in install_requires, otherwise a fresh install fails with an
# ImportError the first time the program starts:
#   baker            - imported by wikicrawl.cli (command dispatch)
#   colorlog         - imported by wikicrawl.log and referenced by the
#                      'colored' formatter in the logging settings
#   selenium         - imported by wikicrawl.browser / wikicrawl.pages
#   yandex.translate - imported by wikicrawl.util
#
# NOTE(review): wikicrawl.pages imports wikicrawl.assets.languages; confirm
# whether 'wikicrawl.assets' also has to be listed under packages.
setup(
    name=__project__,
    version=__version__,
    description='',
    author='',
    author_email='',
    url='',
    install_requires=(
        'baker',
        'colorlog',
        'selenium',
        'yandex.translate',
    ),
    packages=('wikicrawl',),
)

BIN
wikicrawl/.dal.py.swp Normal file

Binary file not shown.

12
wikicrawl/__init__.py Normal file

@ -0,0 +1,12 @@
# The __init__.py file signals to the python interpreter that the
# wikicrawl directory is a package. A package is a special module that
# contains other modules. Each file is a module (browser, cli, etc.)
# and the "wikicrawl" package is a module that contains other modules.
# The wikicrawl package, which is a module, exports the stuff exposed here.
# We export wikicrawl.init() as a reference to wikicrawl.config.init() and
# wikicrawl.main() as a reference to wikicrawl.main.main()
from .config import init
from .main import main

@ -0,0 +1,39 @@
# Lookup table from a two-letter language code to that language's name
# written in the language itself (for example 'de' -> 'Deutsch').
#
# Entries whose value is '' have no name filled in yet -- presumably they
# are placeholders for languages that are not supported so far; TODO confirm.
# The commented-out entries are kept as a record of languages that were
# deliberately left out.
LANGUAGES = {
'az': '',
'be': '',
'bg': '',
'ca': '',
'cs': '',
'da': '',
'de': 'Deutsch',
'el': '',
'en': 'English',
'es': 'Español',
'et': '',
'fi': '',
'fr': 'Français',
'hr': '',
'hu': '',
'hy': '',
'it': 'Italiano',
# 'ja': '日本語', -- no japanese in yandex
'lt': '',
'lv': '',
'mk': '',
'nl': '',
'no': '',
'pl': 'Polski',
'pt': 'Português',
'ro': '',
'ru': 'Русский',
'sk': '',
'sl': '',
'sq': '',
'sr': '',
'sv': '',
'tr': '',
'uk': '',
# 'zh': '中文' -- no chinese
}

@ -11,9 +11,9 @@
import selenium import selenium
import selenium.webdriver import selenium.webdriver
import logging
from . import config from . import config
from . import log
# This function has a parameter (driver) that passes in a value. In this case, # This function has a parameter (driver) that passes in a value. In this case,
# this driver variable defaults to the string 'chrome'. The code can call # this driver variable defaults to the string 'chrome'. The code can call
@ -25,9 +25,17 @@ def create_webdriver(driver='chrome'):
return create_webdriver_chrome() return create_webdriver_chrome()
elif driver == 'firefox': elif driver == 'firefox':
return create_webdriver_firefox() return create_webdriver_firefox()
elif driver == 'phantom':
return create_webdriver_phantom()
else:
log.LOGGER('browser').error('unable to handle webdriver request: %s' % driver)
return
def create_webdriver_firefox(): def create_webdriver_firefox():
pass profile = selenium.webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", config.obj.WEBDRIVER_USER_AGENT)
driver = selenium.webdriver.Firefox(profile)
return driver
def create_webdriver_chrome(): def create_webdriver_chrome():
opt = selenium.webdriver.chrome.options.Options() opt = selenium.webdriver.chrome.options.Options()
@ -35,3 +43,7 @@ def create_webdriver_chrome():
driver = selenium.webdriver.Chrome(chrome_options = opt) driver = selenium.webdriver.Chrome(chrome_options = opt)
return driver return driver
def create_webdriver_phantom():
    """
    Creates and returns a selenium webdriver backed by PhantomJS.

    The driver is created with selenium's default options.
    NOTE(review): unlike the firefox factory, no user agent override is
    applied here -- confirm whether WEBDRIVER_USER_AGENT should be honored.
    """
    return selenium.webdriver.PhantomJS()

@ -1,37 +1,42 @@
#!/usr/bin/env python # The command-line interface module creates an interface for
# The command-line interface module creates a interface for
# interacting with the python program (wikicrawl). This is an implementation # interacting with the python program (wikicrawl). This is an implementation
# of the baker demo shown previously. The user can type in commands to # of the baker demo shown previously. The user can type in commands to
# make the program do things. # make the program do things.
import baker import baker
import logging
import readline # Needed for command history <up> and <down> arrows to work
import sys import sys
if sys.platform == 'linux':
import readline # Needed for command history <up> and <down> arrows to work
from . import log
from . import model from . import model
from . import config from . import config
# Problem pages: # Problem pages:
# Decision (from politics) # Decision (from politics)
# Malaysia (goes inside parenthesis) # Malaysia (goes inside parenthesis)
# Soft-sediment_deformation_structures (doesn't find link)
# Chemicals (loops at philosophical)
commander = baker.Baker() commander = baker.Baker()
def main():
user_interface = InteractiveInterface()
if len(sys.argv) > 1: # Command line arguments were passed in
# command-line when invoking python
user_interface.run(sys.argv)
else:
user_interface.start_command_loop()
class InteractiveInterface: class InteractiveInterface:
def __init__(self): def __init__(self):
# Instantiate the variable self.model as an object
# of instance of the Model class defined in the model
# module. model.Model refers to the Model class in the
# model module and this line creates a new variable (self.model)
# which is a variable that is an instance of Model, i.e.
# it has the type Model and has Model.methods() available
# to it.
#
# self.model is a variable that is attached to the instance/object
# returned by this constructor that has the type InteractiveInterface.
self.model = model.Model() self.model = model.Model()
def run(self, args, main=True): def run_command(self, args, main=True):
""" """
Runs the command-line interface for a single command. Runs the command-line interface for a single command.
@ -45,13 +50,13 @@ class InteractiveInterface:
commander.run(argv=args, main=True, help_on_error=True, commander.run(argv=args, main=True, help_on_error=True,
instance=self) instance=self)
except baker.CommandError as ex: except baker.CommandError as ex:
logging.warn('incorrect user input: %s' % ex) log.LOGGER['cli'].warn('incorrect user input: %s' % ex)
commander.usage() commander.usage()
except baker.TopHelp as ex: except baker.TopHelp as ex:
commander.usage() commander.usage()
except Exception as ex: except Exception as ex:
logging.error('caught general exception!!') log.LOGGER['cli'].error('caught general exception!!')
print(type(ex), ex) log.LOGGER['cli'].error(type(ex), ex)
def start_command_loop(self): def start_command_loop(self):
""" """
@ -76,12 +81,13 @@ class InteractiveInterface:
# to NOT drop to a newline after printing # to NOT drop to a newline after printing
# in the terminal. Instead, let the user # in the terminal. Instead, let the user
# type their command on the same line as # type their command on the same line as
# our printed '$ '. # the printed '$ '.
try: try:
inp = input() inp = input()
except EOFError: # <ctrl>+D will send "End Line" and exit the command loop except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
break break
# Note in arguments (mg):
# Note on "arguments" (mg):
# Whenever a program is run in windows or *nix, the operating # Whenever a program is run in windows or *nix, the operating
# system passes in the command string that was used to invoke # system passes in the command string that was used to invoke
# the program. You can append data in that command to configure # the program. You can append data in that command to configure
@ -91,16 +97,16 @@ class InteractiveInterface:
# software but you can also pass in an argument. You can # software but you can also pass in an argument. You can
# alternatively run "python launcher.py <argument> <argument>..." # alternatively run "python launcher.py <argument> <argument>..."
# and the operating system will provide the <argument> values into # and the operating system will provide the <argument> values into
# the process that is running. # the process that is running as variables.
# #
# In a real world use case, many commands provide switches to # In a real world use case, many commands provide switches to
# adjust what the program does. For example, # adjust what the program does. For example,
# #
# The command: # The command:
# find music -iname "*justin*bieber*" # find music -name "*justin*bieber*"
# runs the "find" program and asks to find all the filenames that match the # runs the "find" program and asks to find all the filenames that match the
# pattern *justin*bieber* in the "music" directory. # pattern *justin*bieber* in the "music" directory.
# (music, -iname, "*justin*biever*") are argument parameters # (music, -name, "*justin*biever*") are argument parameters
# that are passed into the program. The program is coded to # that are passed into the program. The program is coded to
# parse and interpret these values and execute differently based # parse and interpret these values and execute differently based
# on the values passed in. This is one way to pass in information # on the values passed in. This is one way to pass in information
@ -123,14 +129,21 @@ class InteractiveInterface:
# would be C:\Users\mguest\launcher.py. # would be C:\Users\mguest\launcher.py.
# What this method (start_command_loop()) does is provide a # What this method (start_command_loop()) does is provide a
# REPL which is a # REPL shell which is a
# read-eval-print-loop. It repeatedly asks the user for an # read-eval-print-loop. It repeatedly asks the user for an
# input (read), evaluates that input into an action (evaluate), # input (read), evaluates that input into an action (evaluate),
# give the user some feedback (print), and start the process # give the user some feedback (print), and start the process
# over again (loop). When you call "python", you are given a python # over again (loop). When you call just "python", you are loading a
# process that gives you a REPL interactive shell. The way # program that gives you a REPL interactive shell. The way
# this wikicrawl app is implemented gives the user a REPL # this wikicrawl app is implemented gives the user a REPL
# that has commands to interact with wikipedia pages. # that has commands to interact with wikipedia pages.
# Because we take in the input as a single string, we do
# a transformation to turn something like "do_random_page 5"
# into ["launcher.py", "do_random_page", "5"] which is how
# the arguments array would have been created if it were
# passed in the initial command instead of typed and interpreted
# as input as is done here.
args = [sys.argv[0], ] + inp.split() args = [sys.argv[0], ] + inp.split()
# The user can at any point in the command pass the argument # The user can at any point in the command pass the argument
@ -146,40 +159,42 @@ class InteractiveInterface:
# python launcher.py do_random_page --help # python launcher.py do_random_page --help
# You will see the program spit out the heredoc below the # You will see the program spit out the heredoc below the
# do_random_page method defined below. # do_random_page method defined below.
if '--help' in args: if '--help' in args:
args.remove('--help') args.remove('--help')
try: try:
print('command usage:') print('command usage:')
commander.usage(args[1]) commander.usage(args[1])
return
except Exception as ex: except Exception as ex:
print(type(ex), ex) print(type(ex), ex)
continue continue
self.run(args, main=False) self.run_command(args, main=False)
@commander.command @commander.command
def do_random_page(self): def play_random_page(self):
""" """
Instructs the wikicrawl application to play the game on a random Instructs the wikicrawl application to play the game on a random
article. article.
""" """
self.model.do_random_page() self.model.play_random_page()
@commander.command @commander.command
def do_n_pages(self, n): def play_multiple(self, n):
""" """
Plays the wikicrawl game <n>-times. Plays the wikicrawl game <n>-times.
""" """
try: try:
n = int(n) n = int(n)
except ValueError as ex: except ValueError as ex:
logging.warn('failed to process "%s" as a parameter' % n) log.LOGGER['cli'].warn('failed to process "%s" as a parameter' % n)
return False return False
for i in range(n): for i in range(n):
self.model.do_random_page() self.model.play_random_page()
if __name__ == '__main__': @commander.command
main() def exit(self):
"""
Immediately exit the program.
"""
sys.exit(0)

33
wikicrawl/log.py Normal file

@ -0,0 +1,33 @@
# log module is a wrapper around third-party colorlog library
# and provides an application-level interface to a logging system.
import logging
import logging.config

import colorlog

from . import config
# Default python log severity levels:
# CRITICAL
# ERROR
# WARNING
# INFO
# DEBUG
# Module-wide LoggingLayer instance. It stays None until init_logging()
# is called, so log.LOGGER[...] cannot be used before that point.
LOGGER = None
class LoggingLayer:
    """
    Application-level access point for named loggers.

    Constructing the object applies a dictConfig-style logging
    configuration. Afterwards loggers are looked up by name with
    subscript syntax, e.g. ``layer['cli'].info('message')``.
    """

    def __init__(self, config):
        """
        Applies the logging configuration and starts with an empty cache.

        Args:
            config: dictionary following the logging.config.dictConfig()
                schema. Inside this method the parameter name shadows
                the imported ``config`` module.
        """
        self.loggers = {}
        # logging.config is a submodule: it is only available as an
        # attribute of ``logging`` once ``import logging.config`` has run
        # somewhere, so this module imports it explicitly instead of
        # relying on another module having done so first.
        logging.config.dictConfig(config)

    def __getitem__(self, k):
        """
        Returns the logger registered under the name ``k``.

        The logger is fetched from the logging module on first use and
        kept in ``self.loggers`` for subsequent lookups.
        """
        try:
            return self.loggers[k]
        except KeyError:
            logger = logging.getLogger(k)
            self.loggers[k] = logger
            return logger
def init_logging():
    """
    Creates the module-wide LoggingLayer from config.obj.LOG_SETTINGS and
    stores it in the global LOGGER.
    """
    global LOGGER
    log_settings = config.obj.LOG_SETTINGS
    LOGGER = LoggingLayer(log_settings)

19
wikicrawl/main.py Normal file

@ -0,0 +1,19 @@
#!/usr/bin/env python
import sys
from . import cli
from . import util
def main():
    """
    Entry point of the application.

    When extra command-line arguments were supplied they are executed as
    a single command; otherwise the interactive command loop is started.
    """
    interface = cli.InteractiveInterface()
    if len(sys.argv) <= 1:
        # Nothing besides the program name: drop into the interactive shell.
        interface.start_command_loop()
        return
    # Command line arguments were passed in when invoking python.
    interface.run_command(sys.argv)


if __name__ == '__main__':
    main()

@ -6,7 +6,6 @@
# to implement the wiki crawl. This is a separation of concerns # to implement the wiki crawl. This is a separation of concerns
# and keeps the logic organized and separated. # and keeps the logic organized and separated.
import logging
import os import os
import time import time
@ -15,10 +14,12 @@ from . import config
from . import dal from . import dal
from . import log from . import log
from . import pages from . import pages
from . import util
class Model: class Model:
def __init__(self): def __init__(self):
self._webdriver = None self._webdriver = None
self._translated_philosophy = None
@property @property
def webdriver(self): def webdriver(self):
@ -37,6 +38,16 @@ class Model:
page_api.goto_landing_page() page_api.goto_landing_page()
return self._webdriver return self._webdriver
@property
def translated_philosophy(self):
    """
    The word 'philosophy' in the configured page language.

    For English the literal word is used. For any other language the
    translation is requested once and the stored result is re-used on
    later accesses.
    """
    language = config.obj.PAGE_LANGUAGE
    if language == 'en':
        self._translated_philosophy = 'philosophy'
        return self._translated_philosophy
    if not self._translated_philosophy:
        # Nothing stored yet: ask the translation helper and keep the answer.
        self._translated_philosophy = util.translate_text('en', language, 'philosophy')
    return self._translated_philosophy
def open_browser(self): def open_browser(self):
x = self.webdriver # Request the browser open immediately. x = self.webdriver # Request the browser open immediately.
# Without this, the Model object will # Without this, the Model object will
@ -48,7 +59,7 @@ class Model:
# creates it and then it is re-used later # creates it and then it is re-used later
# in the application. # in the application.
def do_random_page(self): def play_random_page(self):
""" """
Select a random page and repeatedly click the first link until Select a random page and repeatedly click the first link until
we reach the article on philosophy. Sometimes, the driver encounters we reach the article on philosophy. Sometimes, the driver encounters
@ -93,25 +104,50 @@ class Model:
# the action we're trying to invoke. # the action we're trying to invoke.
page_api.goto_random_article() page_api.goto_random_article()
# Article page # Article pages
pages_visited = [] pages_visited = []
# We just need translated_title to exist
translated_title = None
while True: while True:
page_api = pages.ArticlePage(self.webdriver) page_api = pages.ArticlePage(self.webdriver)
# Get the article title (and translate if necessary)
title = page_api.get_title() title = page_api.get_title()
logging.debug('visited page: %s' % title) if config.obj.PAGE_LANGUAGE != 'en':
translated_title = util.translate_text(config.obj.PAGE_LANGUAGE, 'en', title)
log.LOGGER['model'].info('visited page: %s (%s)' % (title, translated_title))
else:
log.LOGGER['model'].info('visited page: %s' % title)
# Check for page loops (have we already visited this page?)
if title in pages_visited: if title in pages_visited:
logging.info('encountered loop at page = %s' % title) log.LOGGER['model'].info('encountered loop at page = %s' % title)
break break
if title == 'Philosophy':
logging.info('made it to philosophy in %s pages' % len(pages_visited)) # Check if we reached the article on philosophy
if self._is_article_on_philosophy(title, translated_title):
log.LOGGER['model'].info('made it to philosophy in %s pages' % len(pages_visited))
pages_visited.append(title) pages_visited.append(title)
break break
# Store the result of what articles have been navigated
pages_visited.append(title) pages_visited.append(title)
rc = page_api.click_first_link() rc = page_api.click_first_link()
if not rc: if not rc:
logging.warn('failure: unable to continue (perhaps no valid links?)') log.LOGGER['model'].warn('failure: unable to continue (perhaps no valid links?)')
break break
print() print()
def _is_article_on_philosophy(self, title, translated_title):
"""
Checks both the original title and the translated (to english) title to
see if they seem to be the page on philosophy.
"""
if title.lower() == self.translated_philosophy.lower():
return True
if translated_title and translated_title.lower() == 'philosophy':
return True
return False

@ -1,22 +1,16 @@
# Pages module defines classes for interacting with wikipedia pages. # pages module defines classes for interacting with wikipedia pages.
# There are separate classes defined for each page with their own # There are separate classes defined for each page with their own
# defined methods for performing certain actions. # defined methods for performing certain actions.
import logging
import re import re
import selenium import selenium
import time import time
from . import browser from . import browser
from . import config from . import config
from . import log
def breakpoint(): from . import util
""" from .assets.languages import LANGUAGES
If DO_BREAKPOINTS is switched on, this will pause program
execution and wait for the user to press enter to continue.
"""
if config.obj.DO_BREAKPOINTS:
input('Breakpoint here. <Enter> to continue...')
class PageRootObject: class PageRootObject:
""" """
@ -28,7 +22,7 @@ class PageRootObject:
In here are some re-used methods to click links and highlight In here are some re-used methods to click links and highlight
elements in the browser. elements in the browser.
""" """
def __init__(self, driver=None): def __init__(self, driver):
""" """
Object constructor for initializing the instance of this Object constructor for initializing the instance of this
class with internal variables needed. class with internal variables needed.
@ -37,9 +31,6 @@ class PageRootObject:
driver: Reference to the selenium webdriver object driver: Reference to the selenium webdriver object
that is used to interface with the web browser. that is used to interface with the web browser.
""" """
if not driver:
self.driver = browser.create_webdriver()
else:
self.driver = driver self.driver = driver
def click(self, el): def click(self, el):
@ -49,11 +40,11 @@ class PageRootObject:
Args: Args:
el: selenium element to be clicked. Typically an anchor el: selenium element to be clicked. Typically an anchor
html link in the page. html link in the webpage.
""" """
self.highlight(el, 'red') self.highlight(el, 'red')
time.sleep(config.obj.PAGE_DELAY) time.sleep(config.obj.PAGE_DELAY)
breakpoint() util.breakpoint()
el.click() el.click()
def highlight(self, el, color): def highlight(self, el, color):
@ -68,36 +59,55 @@ class PageRootObject:
color: background color to highlight. Input can be one of color: background color to highlight. Input can be one of
'red', 'blue', or hex code such as '#ffffff'. 'red', 'blue', or hex code such as '#ffffff'.
""" """
# Note: The way hex codes work is there are 1 byte (2 hex characters) # Note: The way hex codes work is there is 1 byte (2 hex characters)
# for every color. #RRGGBB for (red, green, blue). This can be thought # for every color. #RRGGBB for (red, green, blue). This can be thought
# of as an integer 0-255 for red, green, and blue in base-16 hexadecimal. # of as an integer 0-255 for red, green, and blue in base-16 hexadecimal.
# For example, #ff0000 is bright red while #002f00 is light green
# and #ffff00 is full yellow.
if color == 'red': if color == 'red':
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292' color = '#ff9292'
# js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
elif color == 'blue': elif color == 'blue':
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff' color = '#9292ff'
# js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
else: else:
# color = color
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
self.driver.execute_script(js, el) self.driver.execute_script(js, el)
# Note: This is the syntax for class inheritance. LandingPage is a new type of object that inherits
# everything from the PageRootObject type. With this, you can call LandingPage.highlight() which
# is a method defined in PageRootObject.
class LandingPage(PageRootObject): class LandingPage(PageRootObject):
""" """
Interface for working with the wikipedia.org landing page. This page has links to Interface for working with the wikipedia.org landing page. This page has links to
select a language and go to the respective wikipedia root page. select a language and go to the respective wikipedia root page.
""" """
# Note: This is the LandingPage() object constructor. All it does right now is # Note: This is the LandingPage() class constructor. The constructor is a method
# that is executed when a new object of this class is created. All it does right now is
# reference the parent (PageRootObject) constructor method and call it. This # reference the parent (PageRootObject) constructor method and call it. This
# calls PageRootObject.__init__(driver) which makes the web driver available # calls PageRootObject.__init__(driver) which then makes the web driver available
# in the object instance. # in the object instance.
def __init__(self, driver=None): def __init__(self, driver=None):
super().__init__(driver) super().__init__(driver)
def goto_landing_page(self): def goto_landing_page(self):
"""
Navigates the browser to www.wikipedia.org
"""
self.driver.get(config.obj.PAGE_BASE_URL) self.driver.get(config.obj.PAGE_BASE_URL)
def select_language(self, language):
    """
    Clicks the landing page link for the requested language.

    Args:
        language: language code used as a key into LANGUAGES,
            for example 'en'.

    Returns:
        True when the link was found and clicked, False otherwise.
    """
    lang_text = LANGUAGES.get(language)
    if not lang_text:
        # Either the code is unknown (None) or its link text has not been
        # filled in (''). Neither value can identify the link for the
        # requested language, so report the failure instead of handing
        # it to the element lookup.
        log.LOGGER['pages'].warning('failed to find language: %s as %s' % (language, lang_text))
        return False
    try:
        link = self.driver.find_element_by_partial_link_text(lang_text)
    except selenium.common.exceptions.NoSuchElementException:
        # This module no longer imports logging, so the failure is
        # reported through the application's 'pages' logger.
        log.LOGGER['pages'].warning('failed to find language: %s as %s' % (language, lang_text))
        return False
    else:
        self.click(link)
        return True
class MainPage(PageRootObject): class MainPage(PageRootObject):
""" """
@ -121,7 +131,7 @@ class ArticlePage(PageRootObject):
# These are used to locate html elements in the web browser. There are many # These are used to locate html elements in the web browser. There are many
# ways to locate elements but one of the best if available is locating by id. It's # ways to locate elements but one of the best if available is locating by id. It's
# not enforced but the html specification mandates that element id's are unique # not enforced but the html specification mandates that element id's are unique
# so if you can select by id in a semanticly correct web page, you can correctly # so if you can select by id in a semantically correct web page, you can correctly
# select unique elements with high confidence. # select unique elements with high confidence.
elements = { elements = {
'main-window-content-text-id': 'mw-content-text', 'main-window-content-text-id': 'mw-content-text',
@ -132,10 +142,18 @@ class ArticlePage(PageRootObject):
super().__init__(driver) super().__init__(driver)
def get_title(self): def get_title(self):
"""
Returns the article title.
"""
heading = self.driver.find_element_by_id(ArticlePage.elements['article-title']) heading = self.driver.find_element_by_id(ArticlePage.elements['article-title'])
return heading.text return heading.text
def click_first_link(self): def click_first_link(self):
"""
Attempts to click the first valid link in the article. Some work is
done to skip over certain links but the implementation breaks in some
edge cases. It's close but not perfect for every article text.
"""
return self._iterate_paragraphs() return self._iterate_paragraphs()
# Note: Here this method has it's name prepended with a single underscore. # Note: Here this method has it's name prepended with a single underscore.
@ -159,33 +177,51 @@ class ArticlePage(PageRootObject):
main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id']) main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]') paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
for p in paragraphs: for p in paragraphs:
# Return code indicates the success status of _parse_paragraph().
# In this case, an rc of True means that it was able to find a
# link and we stop going through paragraphs.
rc = self._parse_paragraph(p) rc = self._parse_paragraph(p)
if rc: if rc:
return True return True
def _parse_paragraph(self, p): def _parse_paragraph(self, p):
"""
Attempts to find a valid link in the paragraph element sent in.
Args:
p: Reference to selenium paragraph element. This is a paragraph
taken from the article.
"""
links = p.find_elements_by_xpath('.//a') links = p.find_elements_by_xpath('.//a')
if len(links) == 0: if len(links) == 0:
return False return False
for link in links: for link in links:
logging.debug('processing link: %s' % link.text) log.LOGGER['pages'].debug('processing link: %s' % link.text)
if not self._is_valid_link(p, link): if not self._is_valid_link(p, link):
logging.debug('skipping link inside parenthesis: %s' % link.text) log.LOGGER['pages'].debug('skipping link inside parenthesis: %s' % link.text)
self.highlight(link, 'blue') self.highlight(link, 'blue')
continue continue
self.highlight(link, 'red') self.highlight(link, 'red')
logging.info('selected link: %s' % link.text) log.LOGGER['pages'].info('selected link: %s' % link.text)
self.click(link) self.click(link)
return True return True
def _is_valid_link(self, p, el): def _is_valid_link(self, p, el):
a = self._is_link_in_parenthesis(p, el) """
b = self._is_link_a_footnote(el) Returns if the implementation decides to skip this link. You can
c = self._is_link_pronounciation(el) see the reasons we invalidate and skip a link here. If it's
d = self._is_link_audio(el) inside parenthesis, is a footnote, is a pronounciation guide or
if not a and not b and not c and not d: audio link, we choose to skip it.
return True """
if self._is_link_in_parenthesis(p, el):
return False return False
if self._is_link_a_footnote(el):
return False
if self._is_link_pronounciation(el):
return False
if self._is_link_audio(el):
return False
return True
def _is_link_in_parenthesis(self, p, el): def _is_link_in_parenthesis(self, p, el):
""" """
@ -198,7 +234,7 @@ class ArticlePage(PageRootObject):
# certain links and usually avoid links inside parenthetical # certain links and usually avoid links inside parenthetical
# notes. Some edge cases are nested parenthesis, links with # notes. Some edge cases are nested parenthesis, links with
# non-english characters (which are displayed with a tree # non-english characters (which are displayed with a tree
# of elements in the html rather than a simply link). And # of elements in the html rather than a simple link). And
# sometimes, the link inside the parenthesis may be a valid # sometimes, the link inside the parenthesis may be a valid
# target. I've made it so that skipped links show up as blue # target. I've made it so that skipped links show up as blue
# and determined-valid links highlight as red. # and determined-valid links highlight as red.

23
wikicrawl/util.py Normal file

@ -0,0 +1,23 @@
# util module contains utility functions that can be common or shared
# between the other modules.
import yandex_translate
from . import config
def breakpoint():
    """
    Pauses the program until the user presses <Enter>, but only when
    config.obj.DO_BREAKPOINTS is switched on. Otherwise does nothing.

    Note: within this module the name hides the python builtin
    breakpoint() that exists in newer interpreter versions.
    """
    if not config.obj.DO_BREAKPOINTS:
        return
    input('BREAKPOINT hit. <Enter> to continue...')
def translate_text(source_language, target_language, text):
    """
    Translates text with the yandex translate client.

    Args:
        source_language: language code of the input text. When falsy,
            the client is asked to detect the language of the text.
        target_language: language code to translate into.
        text: the text to translate.

    Returns:
        The first entry of the 'text' list in the client's response.
    """
    client = yandex_translate.YandexTranslate(config.obj.YANDEX_API_KEY)
    origin = source_language if source_language else client.detect(text)
    # The client takes the direction as '<source>-<target>', e.g. 'en-es'.
    direction = '{}-{}'.format(origin, target_language)
    response = client.translate(text, direction)
    translations = response['text']
    return translations[0]