mirror of
https://git.zavage.net/Zavage-Software/wikicrawl.git
synced 2024-11-22 00:00:25 -07:00
lots of documentation and safer functionality
This commit is contained in:
parent
64093c58a2
commit
5d88690ded
@ -1,4 +1 @@
|
|||||||
You need selenium-server installed and running:
|
|
||||||
|
|
||||||
java -jar /usr/share/selenium-server/selenium-server-standalone.jar -timeout 0
|
|
||||||
|
|
||||||
|
@ -1 +1,12 @@
|
|||||||
|
# The __init__.py file signals to the python interpreter that the
|
||||||
|
# app directory is a package. A package is a special module that
|
||||||
|
# contains other modules. Each file is a module (browser, cli, etc.)
|
||||||
|
# and the "app" package is a module that contains other modules.
|
||||||
|
|
||||||
|
# The "app" module exports the stuff exposed here. We export
|
||||||
|
# app.init() as a reference to app.config.init() and app.main
|
||||||
|
# as a reference to app.cli.main
|
||||||
|
|
||||||
|
from .config import init
|
||||||
|
from .cli import main
|
||||||
|
|
||||||
|
@ -1,13 +1,25 @@
|
|||||||
|
# browser module defines functions for creating selenium webdriver
|
||||||
|
# objects. The way this works is selenium is a third-party library
|
||||||
|
# that gives a common interface for interacting with web browsers,
|
||||||
|
# i.e. chrome, firefox, internet explorer, and even a pseudo-browser.
|
||||||
|
#
|
||||||
|
# This library (selenium) creates a web browser process
|
||||||
|
# (chrome will actually fire up for you to see) and gives you
|
||||||
|
# a webdriver object interface to programmatically control the browser.
|
||||||
|
# You can do things like click on links, extract information from the
|
||||||
|
# page, pass control to a user... the limit is your imagination.
|
||||||
|
|
||||||
import selenium
|
import selenium
|
||||||
import selenium.webdriver
|
import selenium.webdriver
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
settings = {}
|
from . import config
|
||||||
|
|
||||||
def init(settings_obj):
|
|
||||||
global settings
|
|
||||||
settings = settings_obj
|
|
||||||
|
|
||||||
|
# This function has a parameter (driver) that passes in a value. In this case,
|
||||||
|
# this driver variable defaults to the string 'chrome'. The code can call
|
||||||
|
# create_webdriver() which is the same as create_webdriver('chrome') but
|
||||||
|
# can alternatively call create_webdriver('firefox') and get different
|
||||||
|
# functionality.
|
||||||
def create_webdriver(driver='chrome'):
|
def create_webdriver(driver='chrome'):
|
||||||
if driver == 'chrome':
|
if driver == 'chrome':
|
||||||
return create_webdriver_chrome()
|
return create_webdriver_chrome()
|
||||||
@ -19,9 +31,7 @@ def create_webdriver_firefox():
|
|||||||
|
|
||||||
def create_webdriver_chrome():
|
def create_webdriver_chrome():
|
||||||
opt = selenium.webdriver.chrome.options.Options()
|
opt = selenium.webdriver.chrome.options.Options()
|
||||||
opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT)
|
opt.add_argument('--user-agent=' + config.obj.WEBDRIVER_USER_AGENT)
|
||||||
opt.add_argument('--kiosk-printing')
|
|
||||||
opt.add_argument("--focus-existing-tab-on-open=false")
|
|
||||||
driver = selenium.webdriver.Chrome(chrome_options = opt)
|
driver = selenium.webdriver.Chrome(chrome_options = opt)
|
||||||
return driver
|
return driver
|
||||||
|
|
||||||
|
118
app/cli.py
118
app/cli.py
@ -1,4 +1,8 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# The command-line interface module creates an interface for
|
||||||
|
# interacting with the python program (wikicrawl). This is an implementation
|
||||||
|
# of the baker demo shown previously. The user can type in commands to
|
||||||
|
# make the program do things.
|
||||||
|
|
||||||
import baker
|
import baker
|
||||||
import logging
|
import logging
|
||||||
@ -6,18 +10,13 @@ import readline # Needed for command history <up> and <down> arrows to work
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
from . import model
|
from . import model
|
||||||
|
from . import config
|
||||||
|
|
||||||
# Problem pages:
|
# Problem pages:
|
||||||
# Decision (from politics)
|
# Decision (from politics)
|
||||||
# Malaysia (goes inside parenthesis)
|
# Malaysia (goes inside parenthesis)
|
||||||
|
|
||||||
commander = baker.Baker()
|
commander = baker.Baker()
|
||||||
settings = {}
|
|
||||||
|
|
||||||
def init(settings_obj):
|
|
||||||
global settings
|
|
||||||
settings = settings_obj
|
|
||||||
model.init(settings_obj)
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
user_interface = InteractiveInterface()
|
user_interface = InteractiveInterface()
|
||||||
@ -31,9 +30,17 @@ def main():
|
|||||||
class InteractiveInterface:
|
class InteractiveInterface:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.model = model.Model()
|
self.model = model.Model()
|
||||||
x = self.model.webdriver # Request the browser open immediately
|
|
||||||
|
|
||||||
def run(self, args, main=True):
|
def run(self, args, main=True):
|
||||||
|
"""
|
||||||
|
Runs the command-line interface for a single command.
|
||||||
|
|
||||||
|
If called by InteractiveInterface.run(sys.argv), this method
|
||||||
|
will execute the commands and arguments specified on command
|
||||||
|
line when running this program. Alternatively, the code could
|
||||||
|
pass in a different set of arguments to specify what to do.
|
||||||
|
See start_command_loop() for more information.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
commander.run(argv=args, main=True, help_on_error=True,
|
commander.run(argv=args, main=True, help_on_error=True,
|
||||||
instance=self)
|
instance=self)
|
||||||
@ -49,20 +56,103 @@ class InteractiveInterface:
|
|||||||
def start_command_loop(self):
|
def start_command_loop(self):
|
||||||
"""
|
"""
|
||||||
Repeatedly asks the user what command to run until they exit.
|
Repeatedly asks the user what command to run until they exit.
|
||||||
|
|
||||||
|
This method calls InteractiveInterface.run(args) a little bit
|
||||||
|
differently. Instead of passing the arguments from the command-line
|
||||||
|
that were passed in when invoking the python wikicrawl app,
|
||||||
|
this asks the user for a line of textual input and passes
|
||||||
|
those strings to run() as the arguments. This way, the user can
|
||||||
|
access an interactive shell and repeatedly issue different
|
||||||
|
commands while the application is running.
|
||||||
"""
|
"""
|
||||||
commander.usage()
|
commander.usage()
|
||||||
|
self.model.open_browser()
|
||||||
while True:
|
while True:
|
||||||
print('$ ', end = '') # Display to the user a command prompt
|
print('$ ', end = '') # Display to the user a command prompt
|
||||||
|
# The dollar-sign is a common indication
|
||||||
|
# of a shell that communicates to the user
|
||||||
|
# that we are waiting for their textual
|
||||||
|
# input. The end = '' indicates to python
|
||||||
|
# to NOT drop to a newline after printing
|
||||||
|
# in the terminal. Instead, let the user
|
||||||
|
# type their command on the same line as
|
||||||
|
# our printed '$ '.
|
||||||
try:
|
try:
|
||||||
inp = input()
|
inp = input()
|
||||||
except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
|
except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
|
||||||
break
|
break
|
||||||
args = ['', ] + inp.split()
|
# Note on arguments (mg):
|
||||||
|
# Whenever a program is run in windows or *nix, the operating
|
||||||
|
# system passes in the command string that was used to invoke
|
||||||
|
# the program. You can append data in that command to configure
|
||||||
|
# switches or values going into the program on the fly. For
|
||||||
|
# example, you can invoke this wikicrawl app in more than one
|
||||||
|
# way. You can of course run "python launcher.py" to run the
|
||||||
|
# software but you can also pass in an argument. You can
|
||||||
|
# alternatively run "python launcher.py <argument> <argument>..."
|
||||||
|
# and the operating system will provide the <argument> values into
|
||||||
|
# the process that is running.
|
||||||
|
#
|
||||||
|
# In a real world use case, many commands provide switches to
|
||||||
|
# adjust what the program does. For example,
|
||||||
|
#
|
||||||
|
# The command:
|
||||||
|
# find music -iname "*justin*bieber*"
|
||||||
|
# runs the "find" program and asks to find all the filenames that match the
|
||||||
|
# pattern *justin*bieber* in the "music" directory.
|
||||||
|
# (music, -iname, "*justin*bieber*") are argument parameters
|
||||||
|
# that are passed into the program. The program is coded to
|
||||||
|
# parse and interpret these values and execute differently based
|
||||||
|
# on the values passed in. This is one way to pass in information
|
||||||
|
# into a running program. Some other ways are to read from a file
|
||||||
|
# (such as how we read from settings.py to load the runtime
|
||||||
|
# configuration), from something called environment variables
|
||||||
|
# (won't get into but another set of values provided to programs
|
||||||
|
# from the operating system), or they can be hard-coded into
|
||||||
|
# the application.
|
||||||
|
#
|
||||||
|
# Side note: arguments are not unique to python (almost all
|
||||||
|
# programming languages implement arguments), the functionality
|
||||||
|
# is defined by the application (some programs require arguments,
|
||||||
|
# some are optional, and the syntax for sending in argument
|
||||||
|
# parameters is different and defined by the individual programs),
|
||||||
|
# and lastly, the first argument sent in is the script name or
|
||||||
|
# filename of the script. In our case, the first argument is
|
||||||
|
# the string "launcher.py". If the user invoked the command
|
||||||
|
# as C:\Users\mguest\launcher.py then the first argument
|
||||||
|
# would be C:\Users\mguest\launcher.py.
|
||||||
|
|
||||||
if "--help" in args:
|
# What this method (start_command_loop()) does is provide a
|
||||||
args.remove("--help")
|
# REPL which is a
|
||||||
|
# read-eval-print-loop. It repeatedly asks the user for an
|
||||||
|
# input (read), evaluates that input into an action (evaluate),
|
||||||
|
# gives the user some feedback (print), and starts the process
|
||||||
|
# over again (loop). When you call "python", you are given a python
|
||||||
|
# process that gives you a REPL interactive shell. The way
|
||||||
|
# this wikicrawl app is implemented gives the user a REPL
|
||||||
|
# that has commands to interact with wikipedia pages.
|
||||||
|
args = [sys.argv[0], ] + inp.split()
|
||||||
|
|
||||||
|
# The user can at any point in the command pass the argument
|
||||||
|
# switch "--help". If doing this, the command line interface
|
||||||
|
# will instead print out the inline documentation associated
|
||||||
|
# with this command and quit after doing so. For example,
|
||||||
|
# the user can type "python launcher.py do_random_page --help"
|
||||||
|
# and the program will spit out the generated documentation
|
||||||
|
# for the do_random_page command and run nothing. In our case,
|
||||||
|
# this documentation is created by the baker library and will
|
||||||
|
# print out the docstring associated with the method. Try it
|
||||||
|
# out in your shell (cmd.exe or powershell.exe) by invoking
|
||||||
|
# python launcher.py do_random_page --help
|
||||||
|
# You will see the program spit out the docstring of the
|
||||||
|
# do_random_page method defined below.
|
||||||
|
|
||||||
|
if '--help' in args:
|
||||||
|
args.remove('--help')
|
||||||
try:
|
try:
|
||||||
|
print('command usage:')
|
||||||
commander.usage(args[1])
|
commander.usage(args[1])
|
||||||
|
return
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
print(type(ex), ex)
|
print(type(ex), ex)
|
||||||
continue
|
continue
|
||||||
@ -71,13 +161,21 @@ class InteractiveInterface:
|
|||||||
|
|
||||||
@commander.command
|
@commander.command
|
||||||
def do_random_page(self):
|
def do_random_page(self):
|
||||||
|
"""
|
||||||
|
Instructs the wikicrawl application to play the game on a random
|
||||||
|
article.
|
||||||
|
"""
|
||||||
self.model.do_random_page()
|
self.model.do_random_page()
|
||||||
|
|
||||||
@commander.command
|
@commander.command
|
||||||
def do_n_pages(self, n):
|
def do_n_pages(self, n):
|
||||||
|
"""
|
||||||
|
Plays the wikicrawl game <n>-times.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
n = int(n)
|
n = int(n)
|
||||||
except ValueError as ex:
|
except ValueError as ex:
|
||||||
|
logging.warn('failed to process "%s" as a parameter' % n)
|
||||||
return False
|
return False
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
self.model.do_random_page()
|
self.model.do_random_page()
|
||||||
|
13
app/config.py
Normal file
13
app/config.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
# config module defines a place to store the external configuration/settings
|
||||||
|
# and is used to provide an interface to the runtime configuration for the
|
||||||
|
# program.
|
||||||
|
|
||||||
|
from . import log
|
||||||
|
|
||||||
|
obj = {}
|
||||||
|
|
||||||
|
def init(settings_obj):
|
||||||
|
global obj
|
||||||
|
obj = settings_obj
|
||||||
|
log.init_logging()
|
||||||
|
|
@ -2,11 +2,7 @@ import sqlite3
|
|||||||
import pycurl
|
import pycurl
|
||||||
import os
|
import os
|
||||||
|
|
||||||
settings = {}
|
from . import config
|
||||||
|
|
||||||
def init(settings_obj):
|
|
||||||
global settings
|
|
||||||
settings = settings_obj
|
|
||||||
|
|
||||||
class DataLayer:
|
class DataLayer:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
10
app/log.py
10
app/log.py
@ -1,11 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
settings = {}
|
from . import config
|
||||||
|
|
||||||
def init(settings_obj):
|
|
||||||
global settings
|
|
||||||
settings = settings_obj
|
|
||||||
init_logging()
|
|
||||||
|
|
||||||
def init_logging():
|
def init_logging():
|
||||||
logging.basicConfig(level=settings.LOG_LEVEL)
|
logging.basicConfig(level=config.obj.LOG_LEVEL)
|
||||||
|
|
||||||
|
92
app/model.py
92
app/model.py
@ -1,43 +1,96 @@
|
|||||||
|
# model module contains the business logic of the program. Notice
|
||||||
|
# the command-line interface contains no business logic and only
|
||||||
|
# has functionality to call on the model. Similarly, the page
|
||||||
|
# objects define no specific functionality for doing the wiki crawl
|
||||||
|
# but only provide general utility methods that are called upon
|
||||||
|
# to implement the wiki crawl. This is a separation of concerns
|
||||||
|
# and keeps the logic organized and separated.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from . import browser
|
from . import browser
|
||||||
from . import log
|
from . import config
|
||||||
from . import dal
|
from . import dal
|
||||||
|
from . import log
|
||||||
from . import pages
|
from . import pages
|
||||||
|
|
||||||
settings = {}
|
|
||||||
|
|
||||||
def init(settings_obj):
|
|
||||||
global settings
|
|
||||||
settings = settings_obj
|
|
||||||
|
|
||||||
browser.init(settings_obj)
|
|
||||||
dal.init(settings_obj)
|
|
||||||
pages.init(settings_obj)
|
|
||||||
log.init(settings_obj)
|
|
||||||
|
|
||||||
class Model:
|
class Model:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._webdriver = None
|
self._webdriver = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def webdriver(self):
|
def webdriver(self):
|
||||||
|
# The way this works is when an object instance of class/type Model
|
||||||
|
# is called with x.webdriver, Model runs webdriver(). In our case,
|
||||||
|
# the webdriver() method checks if a private variable self._webdriver
|
||||||
|
# exists and if it doesn't, asks for a new selenium object. The result
|
||||||
|
# is that this will on-demand create a browser. If one exists, it will
|
||||||
|
# use the one that exists and if one doesn't exist, it will create
|
||||||
|
# one and use that. External code can rely on self.webdriver
|
||||||
|
# always existing with or without knowing if it exists because if it
|
||||||
|
# hasn't been created yet then it will be created on-the-fly.
|
||||||
if not self._webdriver:
|
if not self._webdriver:
|
||||||
self._webdriver = browser.create_webdriver(settings.WEBDRIVER_BROWSER)
|
self._webdriver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER)
|
||||||
page_api = pages.LandingPage(self.webdriver)
|
page_api = pages.LandingPage(self.webdriver)
|
||||||
page_api.goto_landing_page()
|
page_api.goto_landing_page()
|
||||||
return self._webdriver
|
return self._webdriver
|
||||||
|
|
||||||
def do_random_page(self):
|
def open_browser(self):
|
||||||
# Landing page (select language)
|
x = self.webdriver # Request the browser open immediately.
|
||||||
page_api = pages.LandingPage(self.webdriver)
|
# Without this, the webdriver will
|
||||||
page_api.goto_landing_page()
|
# be created on-demand (as defined in
|
||||||
page_api.select_language(settings.PAGE_LANGUAGE)
|
# the Model class). This means that the
|
||||||
|
# web browser will not open until
|
||||||
|
# a command is typed in. But because we
|
||||||
|
# request the webdriver right here, Model
|
||||||
|
# creates it and then it is re-used later
|
||||||
|
# in the application.
|
||||||
|
|
||||||
# Main page
|
def do_random_page(self):
|
||||||
|
"""
|
||||||
|
Select a random page and repeatedly click the first link until
|
||||||
|
we reach the article on philosophy. Sometimes, the driver encounters
|
||||||
|
a loop and will never reach the page and sometimes the parser
|
||||||
|
fails and we fail to programmatically implement what we're trying to
|
||||||
|
do correctly.
|
||||||
|
"""
|
||||||
|
# The following 3 lines include the functionality
|
||||||
|
# for the Landing page (select language).
|
||||||
|
|
||||||
|
# This line creates a new object (page_api) which is an instance
|
||||||
|
# of type class pages.LandingPage. LandingPage is a variable
|
||||||
|
# containing a class definition that is located in the pages module.
|
||||||
|
# We pass self.webdriver as arguments into the LandingPage.__init__
|
||||||
|
# constructor.
|
||||||
|
page_api = pages.LandingPage(self.webdriver)
|
||||||
|
|
||||||
|
# This line calls the page_api object's (an instance of
|
||||||
|
# pages.LandingPage type) method goto_landing_page.
|
||||||
|
page_api.goto_landing_page()
|
||||||
|
|
||||||
|
# Similarly, this line calls the select_language method
|
||||||
|
# and passes in values from our runtime configuration.
|
||||||
|
# In this case, we have made the language a parameter
|
||||||
|
# that you can pass into the program, i.e. you can run it
|
||||||
|
# for English or Spanish or Russian or what have you.
|
||||||
|
page_api.select_language(config.obj.PAGE_LANGUAGE)
|
||||||
|
|
||||||
|
# Main page: next 2 lines
|
||||||
|
|
||||||
|
# At this point, we have clicked a link and changed the page. We
|
||||||
|
# re-create our page interface as a new object which is of
|
||||||
|
# a different class and includes distinct code for working with
|
||||||
|
# the page. In this case, we delete page_api and re-create it
|
||||||
|
# as an object of type pages.MainPage. Again, we pass in
|
||||||
|
# self.webdriver as an object of the selenium webdriver interface.
|
||||||
|
# The page_api calls methods on this webdriver to make the web
|
||||||
|
# browser do various things like click links or extract text.
|
||||||
page_api = pages.MainPage(self.webdriver)
|
page_api = pages.MainPage(self.webdriver)
|
||||||
|
|
||||||
|
# We call pages.MainPage.goto_random_article() to perform
|
||||||
|
# the action we're trying to invoke.
|
||||||
page_api.goto_random_article()
|
page_api.goto_random_article()
|
||||||
|
|
||||||
# Article page
|
# Article page
|
||||||
@ -62,4 +115,3 @@ class Model:
|
|||||||
break
|
break
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
165
app/pages.py
165
app/pages.py
@ -1,50 +1,109 @@
|
|||||||
|
# Pages module defines classes for interacting with wikipedia pages.
|
||||||
|
# There are separate classes defined for each page with their own
|
||||||
|
# defined methods for performing certain actions.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import selenium
|
import selenium
|
||||||
import selenium.webdriver
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
settings = {}
|
from . import browser
|
||||||
|
from . import config
|
||||||
def init(settings_obj):
|
|
||||||
global settings
|
|
||||||
settings = settings_obj
|
|
||||||
|
|
||||||
def breakpoint():
|
def breakpoint():
|
||||||
if settings.DO_BREAKPOINTS:
|
"""
|
||||||
|
If DO_BREAKPOINTS is switched on, this will pause program
|
||||||
|
execution and wait for the user to press enter to continue.
|
||||||
|
"""
|
||||||
|
if config.obj.DO_BREAKPOINTS:
|
||||||
input('Breakpoint here. <Enter> to continue...')
|
input('Breakpoint here. <Enter> to continue...')
|
||||||
|
|
||||||
class PageRootObject:
|
class PageRootObject:
|
||||||
|
"""
|
||||||
|
Common interface methods for working with pages. The specific
|
||||||
|
page classes below inherit these methods and define additional methods
|
||||||
|
so every page has available these methods and any additional
|
||||||
|
methods they define.
|
||||||
|
|
||||||
|
In here are some re-used methods to click links and highlight
|
||||||
|
elements in the browser.
|
||||||
|
"""
|
||||||
def __init__(self, driver=None):
|
def __init__(self, driver=None):
|
||||||
|
"""
|
||||||
|
Object constructor for initializing the instance of this
|
||||||
|
class with internal variables needed.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
driver: Reference to the selenium webdriver object
|
||||||
|
that is used to interface with the web browser.
|
||||||
|
"""
|
||||||
if not driver:
|
if not driver:
|
||||||
self.driver = create_webdriver()
|
self.driver = browser.create_webdriver()
|
||||||
else:
|
else:
|
||||||
self.driver = driver
|
self.driver = driver
|
||||||
|
|
||||||
def click(self, el):
|
def click(self, el):
|
||||||
|
"""
|
||||||
|
Clicks a link in the browser and also highlights it to the
|
||||||
|
end user.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
el: selenium element to be clicked. Typically an anchor
|
||||||
|
html link in the page.
|
||||||
|
"""
|
||||||
self.highlight(el, 'red')
|
self.highlight(el, 'red')
|
||||||
time.sleep(settings.PAGE_DELAY)
|
time.sleep(config.obj.PAGE_DELAY)
|
||||||
|
breakpoint()
|
||||||
el.click()
|
el.click()
|
||||||
|
|
||||||
def highlight(self, el, color):
|
def highlight(self, el, color):
|
||||||
|
"""
|
||||||
|
Highlights an html element in the web browser by changing the
|
||||||
|
background color as well as making the text bold.
|
||||||
|
|
||||||
|
The implementation uses javascript to alter the css of the element.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
el: selenium element to be highlighted.
|
||||||
|
color: background color to highlight. Input can be one of
|
||||||
|
'red', 'blue', or hex code such as '#ffffff'.
|
||||||
|
"""
|
||||||
|
# Note: The way hex codes work is there is 1 byte (2 hex characters)
|
||||||
|
# for every color. #RRGGBB for (red, green, blue). This can be thought
|
||||||
|
# of as an integer 0-255 for red, green, and blue in base-16 hexadecimal.
|
||||||
if color == 'red':
|
if color == 'red':
|
||||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
|
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
|
||||||
elif color == 'blue':
|
elif color == 'blue':
|
||||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
|
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
|
||||||
|
else:
|
||||||
|
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
|
||||||
self.driver.execute_script(js, el)
|
self.driver.execute_script(js, el)
|
||||||
|
|
||||||
class LandingPage(PageRootObject):
|
class LandingPage(PageRootObject):
|
||||||
|
"""
|
||||||
|
Interface for working with the wikipedia.org landing page. This page has links to
|
||||||
|
select a language and go to the respective wikipedia root page.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Note: This is the LandingPage() object constructor. All it does right now is
|
||||||
|
# reference the parent (PageRootObject) constructor method and call it. This
|
||||||
|
# calls PageRootObject.__init__(driver) which makes the web driver available
|
||||||
|
# in the object instance.
|
||||||
def __init__(self, driver=None):
|
def __init__(self, driver=None):
|
||||||
super().__init__(driver)
|
super().__init__(driver)
|
||||||
|
|
||||||
def goto_landing_page(self):
|
def goto_landing_page(self):
|
||||||
self.driver.get(settings.PAGE_BASE_URL)
|
self.driver.get(config.obj.PAGE_BASE_URL)
|
||||||
|
|
||||||
def select_language(self, language):
|
def select_language(self, language):
|
||||||
link = self.driver.find_element_by_partial_link_text(language)
|
link = self.driver.find_element_by_partial_link_text(language)
|
||||||
self.click(link)
|
self.click(link)
|
||||||
|
|
||||||
class MainPage(PageRootObject):
|
class MainPage(PageRootObject):
|
||||||
|
"""
|
||||||
|
Interface for a selected language root page. This has the link to go to a random article
|
||||||
|
and has a featured article. An example url for this is https://en.wikipedia.org.
|
||||||
|
"""
|
||||||
def __init__(self, driver=None):
|
def __init__(self, driver=None):
|
||||||
super().__init__(driver)
|
super().__init__(driver)
|
||||||
|
|
||||||
@ -53,7 +112,17 @@ class MainPage(PageRootObject):
|
|||||||
self.click(link)
|
self.click(link)
|
||||||
|
|
||||||
class ArticlePage(PageRootObject):
|
class ArticlePage(PageRootObject):
|
||||||
|
"""
|
||||||
|
Interface for a wikipedia article page. Here are defined some utility methods to
|
||||||
|
try and click the first valid link and extract some information from the page.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Here are static class-scoped variables that are needed to work with the page.
|
||||||
|
# These are used to locate html elements in the web browser. There are many
|
||||||
|
# ways to locate elements but one of the best if available is locating by id. It's
|
||||||
|
# not enforced but the html specification mandates that element ids are unique
|
||||||
|
# so if you can select by id in a semantically correct web page, you can correctly
|
||||||
|
# select unique elements with high confidence.
|
||||||
elements = {
|
elements = {
|
||||||
'main-window-content-text-id': 'mw-content-text',
|
'main-window-content-text-id': 'mw-content-text',
|
||||||
'article-title': 'firstHeading',
|
'article-title': 'firstHeading',
|
||||||
@ -69,7 +138,24 @@ class ArticlePage(PageRootObject):
|
|||||||
def click_first_link(self):
|
def click_first_link(self):
|
||||||
return self._iterate_paragraphs()
|
return self._iterate_paragraphs()
|
||||||
|
|
||||||
|
# Note: Here this method has its name prefixed with a single underscore.
|
||||||
|
# This is a convention that communicates to the developer that these methods
|
||||||
|
# are internal private methods. That means they are not meant to be exposed
|
||||||
|
# to the external interface. Python does not restrict calling these methods.
|
||||||
|
# You can still call ArticlePage._iterate_paragraphs() but the prefix
|
||||||
|
# underscore tells you that it is not intended to be exposed and may be
|
||||||
|
# unsafe to call. Depending on the implementation, it may not make sense
|
||||||
|
# to directly call this method and may result in undefined and unexpected
|
||||||
|
# behavior. _iterate_paragraphs is called internally from the exposed
|
||||||
|
# click_first_link() but is never invoked externally.
|
||||||
def _iterate_paragraphs(self):
|
def _iterate_paragraphs(self):
|
||||||
|
"""
|
||||||
|
Iterates through paragraphs in the page and attempts to find the first
|
||||||
|
valid link. Sometimes the first paragraph does not have a link so this
|
||||||
|
needs to go through a few paragraphs and it does not make sense to
|
||||||
|
operate on the entire article every time when we're just looking for
|
||||||
|
the first link, for performance optimization.
|
||||||
|
"""
|
||||||
main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
|
main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
|
||||||
paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
|
paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
|
||||||
for p in paragraphs:
|
for p in paragraphs:
|
||||||
@ -89,8 +175,7 @@ class ArticlePage(PageRootObject):
|
|||||||
continue
|
continue
|
||||||
self.highlight(link, 'red')
|
self.highlight(link, 'red')
|
||||||
logging.info('selected link: %s' % link.text)
|
logging.info('selected link: %s' % link.text)
|
||||||
breakpoint()
|
self.click(link)
|
||||||
link.click()
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _is_valid_link(self, p, el):
|
def _is_valid_link(self, p, el):
|
||||||
@ -98,45 +183,61 @@ class ArticlePage(PageRootObject):
|
|||||||
b = self._is_link_a_footnote(el)
|
b = self._is_link_a_footnote(el)
|
||||||
c = self._is_link_pronounciation(el)
|
c = self._is_link_pronounciation(el)
|
||||||
d = self._is_link_audio(el)
|
d = self._is_link_audio(el)
|
||||||
print(a, b, c, d)
|
|
||||||
if not a and not b and not c and not d:
|
if not a and not b and not c and not d:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _is_link_in_parenthesis(self, p, el):
|
def _is_link_in_parenthesis(self, p, el):
|
||||||
# link_text = el.text
|
"""
|
||||||
|
Determine if a given link element is inside a set
|
||||||
|
of textual parenthesis.
|
||||||
|
"""
|
||||||
|
# Implementation notes (mg):
|
||||||
|
# I've tried a few different ways to do this and it's
|
||||||
|
# hard to get it to work in every case. I want to avoid
|
||||||
|
# certain links and usually avoid links inside parenthetical
|
||||||
|
# notes. Some edge cases are nested parenthesis, links with
|
||||||
|
# non-english characters (which are displayed with a tree
|
||||||
|
# of elements in the html rather than a simple link). And
|
||||||
|
# sometimes, the link inside the parenthesis may be a valid
|
||||||
|
# target. I've made it so that skipped links show up as blue
|
||||||
|
# and determined-valid links highlight as red.
|
||||||
link_text = el.get_attribute('outerHTML')
|
link_text = el.get_attribute('outerHTML')
|
||||||
p_text = p.get_attribute('innerHTML')
|
p_text = p.get_attribute('innerHTML')
|
||||||
|
|
||||||
regex_str = '\(.*?\)'
|
regex_str = '\(.*?\)' # Regular expression to extract the
|
||||||
|
# text inside (not nested) parenthesis
|
||||||
regex = re.compile(regex_str, flags=re.UNICODE)
|
regex = re.compile(regex_str, flags=re.UNICODE)
|
||||||
match = regex.search(p_text)
|
match = regex.search(p_text)
|
||||||
if not match:
|
if not match:
|
||||||
|
# There are no parentheses at all in this paragraph.
|
||||||
return False
|
return False
|
||||||
|
|
||||||
while match is not None:
|
while match is not None:
|
||||||
|
# There may be multiple parenthesis (or nested). This
|
||||||
|
# iterates through them and checks if the links html
|
||||||
|
# is present inside these parenthesis.
|
||||||
|
#
|
||||||
|
# Care must be taken with regular expressions as they are
|
||||||
|
# user/developer unfriendly, hard-to-read, and unforgiving.
|
||||||
|
# For example, what happens when you try to match (<anything>)
|
||||||
|
# inside of (some words) some more words (even more words), you
|
||||||
|
# can match unpaired parenthesis and the computer will return
|
||||||
|
# unexpected results. The code is quite dumb and does exactly
|
||||||
|
# what you tell it to.
|
||||||
match_text = match.group(0)
|
match_text = match.group(0)
|
||||||
match_idx = match.end(0)
|
match_idx = match.end(0)
|
||||||
print(link_text)
|
|
||||||
print(match_text)
|
|
||||||
if link_text in match_text:
|
if link_text in match_text:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
match = regex.search(p_text, match_idx+1)
|
match = regex.search(p_text, match_idx+1)
|
||||||
|
|
||||||
# Is the link inside parenthesis?
|
return False
|
||||||
# regex_str = '\([^()]*<a.*?>%s</a>[^())]*\)' % re.escape(link_text)
|
|
||||||
# regex_str = '\(.*<a.*?>%s.*\)' % link_text
|
|
||||||
# print(regex_str)
|
|
||||||
# regex = re.compile(regex_str, flags=re.UNICODE)
|
|
||||||
# match = re.search(regex_str, p_text)
|
|
||||||
# if match: # Pattern is found in the text
|
|
||||||
# print(match.group(0))
|
|
||||||
# return True
|
|
||||||
# else:
|
|
||||||
# return False
|
|
||||||
|
|
||||||
def _is_link_a_footnote(self, el):
|
def _is_link_a_footnote(self, el):
|
||||||
|
# Some links are anchors to footnotes, e.g. [1] that points to a source
|
||||||
|
# at the bottom of the page. These aren't valid links for our purpose
|
||||||
|
# so this method looks for that and determines if the reference element
|
||||||
|
# appears to be a link to a footnote.
|
||||||
href = el.get_attribute('href')
|
href = el.get_attribute('href')
|
||||||
if '#cite_note' in href:
|
if '#cite_note' in href:
|
||||||
return True
|
return True
|
||||||
@ -145,12 +246,18 @@ class ArticlePage(PageRootObject):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def _is_link_pronounciation(self, el):
|
def _is_link_pronounciation(self, el):
|
||||||
|
# Some links point to the wikipedia IPA (international phonetic
|
||||||
|
# alphabet) pronunciation help page. We don't want to click these
|
||||||
|
# links so we scan for and ignore them.
|
||||||
href = el.get_attribute('href')
|
href = el.get_attribute('href')
|
||||||
if '/wiki/Help:IPA' in href:
|
if '/wiki/Help:IPA' in href:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _is_link_audio(self, el):
|
def _is_link_audio(self, el):
|
||||||
|
# Some links are audio playback pronunciations. We look for these
|
||||||
|
# by checking for the file extension .ogg (an audio file format,
|
||||||
|
# ogg-vorbis) and ignoring links if they are of that type.
|
||||||
href = el.get_attribute('href')
|
href = el.get_attribute('href')
|
||||||
if '.ogg' in href:
|
if '.ogg' in href:
|
||||||
return True
|
return True
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
import app.cli
|
import app
|
||||||
import settings
|
import settings
|
||||||
|
|
||||||
app.cli.init(settings.Settings)
|
# Inject the settings.DefaultSettings object into the
|
||||||
app.cli.main()
|
# app and start running the program.
|
||||||
|
app.init(settings.DefaultSettings)
|
||||||
|
app.main()
|
||||||
input('<enter> to exit')
|
input('<enter> to exit')
|
||||||
|
|
||||||
|
12
settings.py
12
settings.py
@ -1,6 +1,16 @@
|
|||||||
|
# Application run-time configuration/settings. This contains variables
|
||||||
|
# that control how the program works but are kept separate from the
|
||||||
|
# program. It makes sense for certain parameters to be adjustable but
|
||||||
|
# not hard-coded into the application. For example, some users may want
|
||||||
|
# to run this program in English while others may want to run in Spanish.
|
||||||
|
# The way this works is we specify those variables external from the
|
||||||
|
# application (here) and pass them into the application (app.config module).
|
||||||
|
# The application then references app.config.obj to access the variables
|
||||||
|
# passed in from here.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
class Settings:
|
class DefaultSettings:
|
||||||
# Application Parameters
|
# Application Parameters
|
||||||
LOG_LEVEL = logging.INFO
|
LOG_LEVEL = logging.INFO
|
||||||
DO_BREAKPOINTS = True
|
DO_BREAKPOINTS = True
|
||||||
|
Loading…
Reference in New Issue
Block a user