mirror of
https://git.zavage.net/Zavage-Software/wikicrawl.git
synced 2024-11-24 17:09:21 -07:00
lots of documentation and safer functionality
This commit is contained in:
parent
64093c58a2
commit
5d88690ded
@ -1,4 +1 @@
|
||||
You need selenium-server installed and running:
|
||||
|
||||
java -jar /usr/share/selenium-server/selenium-server-standalone.jar -timeout 0
|
||||
|
||||
|
@ -1 +1,12 @@
|
||||
# The __init__.py file signals to the python interpreter that the
|
||||
# app directory is a package. A package is a special module that
|
||||
# contains other modules. Each file is a module (browser, cli, etc.)
|
||||
# and the "app" package is a module that contains other modules.
|
||||
|
||||
# The "app" module exports the stuff exposed here. We export
|
||||
# app.init() as a reference to app.config.init() and app.main
|
||||
# as a reference to app.cli.main
|
||||
|
||||
from .config import init
|
||||
from .cli import main
|
||||
|
||||
|
@ -1,13 +1,25 @@
|
||||
# browser module defines functions for creating selenium webdriver
|
||||
# objects. The way this works is selenium is a third-party library
|
||||
# that gives a common interface for interacting with web browsers,
|
||||
# i.e. chrome, firefox, internet explorer, and even a pseudo-browser.
|
||||
#
|
||||
# This library (selenium) creates a web browser process
|
||||
# (chrome will actually fire up for you to see) and gives you
|
||||
# a webdriver object interface to programmatically control the browser.
|
||||
# You can do things like click on links, extract information from the
|
||||
# page, pass control to a user... the limit is your imagination.
|
||||
|
||||
import selenium
|
||||
import selenium.webdriver
|
||||
import logging
|
||||
|
||||
settings = {}
|
||||
|
||||
def init(settings_obj):
|
||||
global settings
|
||||
settings = settings_obj
|
||||
from . import config
|
||||
|
||||
# This function has a parameter (driver) that passes in a value. In this case,
|
||||
# this driver variable defaults to the string 'chrome'. The code can call
|
||||
# create_webdriver() which is the same as create_webdriver('chrome') but
|
||||
# can alternatively call create_webdriver('firefox') and get different
|
||||
# functionality.
|
||||
def create_webdriver(driver='chrome'):
|
||||
if driver == 'chrome':
|
||||
return create_webdriver_chrome()
|
||||
@ -19,9 +31,7 @@ def create_webdriver_firefox():
|
||||
|
||||
def create_webdriver_chrome():
|
||||
opt = selenium.webdriver.chrome.options.Options()
|
||||
opt.add_argument('--user-agent=' + settings.WEBDRIVER_USER_AGENT)
|
||||
opt.add_argument('--kiosk-printing')
|
||||
opt.add_argument("--focus-existing-tab-on-open=false")
|
||||
opt.add_argument('--user-agent=' + config.obj.WEBDRIVER_USER_AGENT)
|
||||
driver = selenium.webdriver.Chrome(chrome_options = opt)
|
||||
return driver
|
||||
|
||||
|
118
app/cli.py
118
app/cli.py
@ -1,4 +1,8 @@
|
||||
#!/usr/bin/env python
|
||||
# The command-line interface module creates an interface for
|
||||
# interacting with the python program (wikicrawl). This is an implementation
|
||||
# of the baker demo shown previously. The user can type in commands to
|
||||
# make the program do things.
|
||||
|
||||
import baker
|
||||
import logging
|
||||
@ -6,18 +10,13 @@ import readline # Needed for command history <up> and <down> arrows to work
|
||||
import sys
|
||||
|
||||
from . import model
|
||||
from . import config
|
||||
|
||||
# Problem pages:
|
||||
# Decision (from politics)
|
||||
# Malaysia (goes inside parenthesis)
|
||||
|
||||
commander = baker.Baker()
|
||||
settings = {}
|
||||
|
||||
def init(settings_obj):
|
||||
global settings
|
||||
settings = settings_obj
|
||||
model.init(settings_obj)
|
||||
|
||||
def main():
|
||||
user_interface = InteractiveInterface()
|
||||
@ -31,9 +30,17 @@ def main():
|
||||
class InteractiveInterface:
|
||||
def __init__(self):
|
||||
self.model = model.Model()
|
||||
x = self.model.webdriver # Request the browser open immediately
|
||||
|
||||
def run(self, args, main=True):
|
||||
"""
|
||||
Runs the command-line interface for a single command.
|
||||
|
||||
If called by InteractiveInterface.run(sys.argv), this method
|
||||
will execute the commands and arguments specified on command
|
||||
line when running this program. Alternatively, the code could
|
||||
pass in a different set of arguments to specify what to do.
|
||||
See start_command_loop() for more information.
|
||||
"""
|
||||
try:
|
||||
commander.run(argv=args, main=True, help_on_error=True,
|
||||
instance=self)
|
||||
@ -49,20 +56,103 @@ class InteractiveInterface:
|
||||
def start_command_loop(self):
|
||||
"""
|
||||
Repeatedly asks the user what command to run until they exit.
|
||||
|
||||
This method calls InteractiveInterface.run(args) a little bit
|
||||
differently. Instead of passing the arguments from the command-line
|
||||
that were passed in when invoking the python wikicrawl app,
|
||||
this asks the user for a line of textual input and passes
|
||||
those strings to run() as the arguments. This way, the user can
|
||||
access an interactive shell and repeatedly issue different
|
||||
commands while the application is running.
|
||||
"""
|
||||
commander.usage()
|
||||
self.model.open_browser()
|
||||
while True:
|
||||
print('$ ', end = '') # Display to the user a command prompt
|
||||
# The dollar-sign is a common indication
|
||||
# of a shell that communicates to the user
|
||||
# that we are waiting for their textual
|
||||
# input. The end = '' indicates to python
|
||||
# to NOT drop to a newline after printing
|
||||
# in the terminal. Instead, let the user
|
||||
# type their command on the same line as
|
||||
# our printed '$ '.
|
||||
try:
|
||||
inp = input()
|
||||
except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
|
||||
break
|
||||
args = ['', ] + inp.split()
|
||||
# Note in arguments (mg):
|
||||
# Whenever a program is run in windows or *nix, the operating
|
||||
# system passes in the command string that was used to invoke
|
||||
# the program. You can append data in that command to configure
|
||||
# switches or values going into the program on the fly. For
|
||||
# example, you can invoke this wikicrawl app in more than one
|
||||
# way. You can of course run "python launcher.py" to run the
|
||||
# software but you can also pass in an argument. You can
|
||||
# alternatively run "python launcher.py <argument> <argument>..."
|
||||
# and the operating system will provide the <argument> values into
|
||||
# the process that is running.
|
||||
#
|
||||
# In a real world use case, many commands provide switches to
|
||||
# adjust what the program does. For example,
|
||||
#
|
||||
# The command:
|
||||
# find music -iname "*justin*bieber*"
|
||||
# runs the "find" program and asks to find all the filenames that match the
|
||||
# pattern *justin*bieber* in the "music" directory.
|
||||
# (music, -iname, "*justin*bieber*") are argument parameters
|
||||
# that are passed into the program. The program is coded to
|
||||
# parse and interpret these values and execute differently based
|
||||
# on the values passed in. This is one way to pass in information
|
||||
# into a running program. Some other ways are to read from a file
|
||||
# (such as how we read from settings.py to load the runtime
|
||||
# configuration), from something called environment variables
|
||||
# (won't get into but another set of values provided to programs
|
||||
# from the operating system), or they can be hard-coded into
|
||||
# the application.
|
||||
#
|
||||
# Side note: arguments are not unique to python (almost all
|
||||
# programming languages implement arguments), the functionality
|
||||
# is defined by the application (some programs require arguments,
|
||||
# some are optional, and the syntax for sending in argument
|
||||
# parameters are different and defined by the individual programs,
|
||||
# and lastly, the first argument sent in is the script name or
|
||||
# filename of the script. In our case, the first argument is
|
||||
# the string "launcher.py". If the user invoked the command
|
||||
# as C:\Users\mguest\launcher.py then the first argument
|
||||
# would be C:\Users\mguest\launcher.py.
|
||||
|
||||
if "--help" in args:
|
||||
args.remove("--help")
|
||||
# What this method (start_command_loop()) does is provide a
|
||||
# REPL which is a
|
||||
# read-eval-print-loop. It repeatedly asks the user for an
|
||||
# input (read), evaluates that input into an action (evaluate),
|
||||
# gives the user some feedback (print), and starts the process
|
||||
# over again (loop). When you call "python", you are given a python
|
||||
# process that gives you a REPL interactive shell. The way
|
||||
# this wikicrawl app is implemented gives the user a REPL
|
||||
# that has commands to interact with wikipedia pages.
|
||||
args = [sys.argv[0], ] + inp.split()
|
||||
|
||||
# The user can at any point in the command pass the argument
|
||||
# switch "--help". If doing this, the command line interface
|
||||
# will instead print out the inline documentation associated
|
||||
# with this command and quit after doing so. For example,
|
||||
# the user can type "python launcher.py do_random_page --help"
|
||||
# and the program will spit out the generated documentation
|
||||
# for the do_random_page command and run nothing. In our case,
|
||||
# this documentation is created by the baker library and will
|
||||
# print out the docstring associated with the method. Try it
|
||||
# out in your shell (cmd.exe or powershell.exe) by invoking
|
||||
# python launcher.py do_random_page --help
|
||||
# You will see the program spit out the docstring below the
|
||||
# do_random_page method defined below.
|
||||
|
||||
if '--help' in args:
|
||||
args.remove('--help')
|
||||
try:
|
||||
print('command usage:')
|
||||
commander.usage(args[1])
|
||||
return
|
||||
except Exception as ex:
|
||||
print(type(ex), ex)
|
||||
continue
|
||||
@ -71,13 +161,21 @@ class InteractiveInterface:
|
||||
|
||||
@commander.command
|
||||
def do_random_page(self):
|
||||
"""
|
||||
Instructs the wikicrawl application to play the game on a random
|
||||
article.
|
||||
"""
|
||||
self.model.do_random_page()
|
||||
|
||||
@commander.command
|
||||
def do_n_pages(self, n):
|
||||
"""
|
||||
Plays the wikicrawl game <n>-times.
|
||||
"""
|
||||
try:
|
||||
n = int(n)
|
||||
except ValueError as ex:
|
||||
logging.warn('failed to process "%s" as a parameter' % n)
|
||||
return False
|
||||
for i in range(n):
|
||||
self.model.do_random_page()
|
||||
|
13
app/config.py
Normal file
13
app/config.py
Normal file
@ -0,0 +1,13 @@
|
||||
# config module defines a place to store the external configuration/settings
|
||||
# and is used to provide an interface to the runtime configuration for the
|
||||
# program.
|
||||
|
||||
from . import log
|
||||
|
||||
obj = {}
|
||||
|
||||
def init(settings_obj):
|
||||
global obj
|
||||
obj = settings_obj
|
||||
log.init_logging()
|
||||
|
@ -2,11 +2,7 @@ import sqlite3
|
||||
import pycurl
|
||||
import os
|
||||
|
||||
settings = {}
|
||||
|
||||
def init(settings_obj):
|
||||
global settings
|
||||
settings = settings_obj
|
||||
from . import config
|
||||
|
||||
class DataLayer:
|
||||
def __init__(self):
|
||||
|
10
app/log.py
10
app/log.py
@ -1,11 +1,7 @@
|
||||
import logging
|
||||
|
||||
settings = {}
|
||||
|
||||
def init(settings_obj):
|
||||
global settings
|
||||
settings = settings_obj
|
||||
init_logging()
|
||||
from . import config
|
||||
|
||||
def init_logging():
|
||||
logging.basicConfig(level=settings.LOG_LEVEL)
|
||||
logging.basicConfig(level=config.obj.LOG_LEVEL)
|
||||
|
||||
|
92
app/model.py
92
app/model.py
@ -1,43 +1,96 @@
|
||||
# model module contains the business logic of the program. Notice
|
||||
# the command-line interface contains no business logic and only
|
||||
# has functionality to call on the model. Similarly, the page
|
||||
# objects define no specific functionality for doing the wiki crawl
|
||||
# but only provide general utility methods that are called upon
|
||||
# to implement the wiki crawl. This is a separation of concerns
|
||||
# and keeps the logic organized and separated.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
from . import browser
|
||||
from . import log
|
||||
from . import config
|
||||
from . import dal
|
||||
from . import log
|
||||
from . import pages
|
||||
|
||||
settings = {}
|
||||
|
||||
def init(settings_obj):
|
||||
global settings
|
||||
settings = settings_obj
|
||||
|
||||
browser.init(settings_obj)
|
||||
dal.init(settings_obj)
|
||||
pages.init(settings_obj)
|
||||
log.init(settings_obj)
|
||||
|
||||
class Model:
|
||||
def __init__(self):
|
||||
self._webdriver = None
|
||||
|
||||
@property
|
||||
def webdriver(self):
|
||||
# The way this works is when an object instance of class/type Model
|
||||
# is called with x.webdriver, Model runs webdriver(). In our case,
|
||||
# the webdriver() method checks if a private variable self._webdriver
|
||||
# exists and if it doesn't, asks for a new selenium object. The result
|
||||
# is that this will on-demand create a browser. If one exists, it will
|
||||
# use the one that exists and if one doesn't exist, it will create
|
||||
# one and use that. External code can rely on self.webdriver
|
||||
# always existing with or without knowing if it exists because if it
|
||||
# hasn't been created yet then it will be created on-the-fly.
|
||||
if not self._webdriver:
|
||||
self._webdriver = browser.create_webdriver(settings.WEBDRIVER_BROWSER)
|
||||
self._webdriver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER)
|
||||
page_api = pages.LandingPage(self.webdriver)
|
||||
page_api.goto_landing_page()
|
||||
return self._webdriver
|
||||
|
||||
def do_random_page(self):
|
||||
# Landing page (select language)
|
||||
page_api = pages.LandingPage(self.webdriver)
|
||||
page_api.goto_landing_page()
|
||||
page_api.select_language(settings.PAGE_LANGUAGE)
|
||||
def open_browser(self):
|
||||
x = self.webdriver # Request the browser open immediately.
|
||||
# Without this, the webdriver object will
|
||||
# be created on-demand (as defined in
|
||||
# the Model class). This means that the
|
||||
# web browser will not open until
|
||||
# a command is typed in. But because we
|
||||
# request the webdriver right here, Model
|
||||
# creates it and then it is re-used later
|
||||
# in the application.
|
||||
|
||||
# Main page
|
||||
def do_random_page(self):
|
||||
"""
|
||||
Select a random page and repeatedly click the first link until
|
||||
we reach the article on philosophy. Sometimes, the driver encounters
|
||||
a loop and will never reach the page and sometimes the parser
|
||||
fails and we fail to programmatically implement what we're trying to
|
||||
do correctly.
|
||||
"""
|
||||
# The following 3 lines include the functionality
|
||||
# for the Landing page (select language).
|
||||
|
||||
# This line creates a new object (page_api) which is an instance
|
||||
# of type class pages.LandingPage. LandingPage is a variable
|
||||
# containing a class definition that is located in the pages module.
|
||||
# We pass self.webdriver as arguments into the LandingPage.__init__
|
||||
# constructor.
|
||||
page_api = pages.LandingPage(self.webdriver)
|
||||
|
||||
# This line calls the page_api object's (an instance of
|
||||
# pages.LandingPage type) method goto_landing_page.
|
||||
page_api.goto_landing_page()
|
||||
|
||||
# Similarly, this line calls the select_language method
|
||||
# and passes in values from our runtime configuration.
|
||||
# In this case, we have made the language a parameter
|
||||
# that you can pass into the program, i.e. you can run it
|
||||
# for English or Spanish or Russian or what have you.
|
||||
page_api.select_language(config.obj.PAGE_LANGUAGE)
|
||||
|
||||
# Main page: next 2 lines
|
||||
|
||||
# At this point, we have clicked a link and changed the page. We
|
||||
# re-create our page interface as a new object which is of
|
||||
# a different class and includes distinct code for working with
|
||||
# the page. In this case, we delete page_api and re-create it
|
||||
# as an object of type pages.MainPage. Again, we pass in
|
||||
# self.webdriver as an object of the selenium webdriver interface.
|
||||
# The page_api calls methods on this webdriver to make the web
|
||||
# browser do various things like click links or extract text.
|
||||
page_api = pages.MainPage(self.webdriver)
|
||||
|
||||
# We call pages.MainPage.goto_random_article() to perform
|
||||
# the action we're trying to invoke.
|
||||
page_api.goto_random_article()
|
||||
|
||||
# Article page
|
||||
@ -62,4 +115,3 @@ class Model:
|
||||
break
|
||||
print()
|
||||
|
||||
|
||||
|
165
app/pages.py
165
app/pages.py
@ -1,50 +1,109 @@
|
||||
# Pages module defines classes for interacting with wikipedia pages.
|
||||
# There are separate classes defined for each page with their own
|
||||
# defined methods for performing certain actions.
|
||||
|
||||
import logging
|
||||
import re
|
||||
import selenium
|
||||
import selenium.webdriver
|
||||
import time
|
||||
|
||||
settings = {}
|
||||
|
||||
def init(settings_obj):
|
||||
global settings
|
||||
settings = settings_obj
|
||||
from . import browser
|
||||
from . import config
|
||||
|
||||
def breakpoint():
|
||||
if settings.DO_BREAKPOINTS:
|
||||
"""
|
||||
If DO_BREAKPOINTS is switched on, this will pause program
|
||||
execution and wait for the user to press enter to continue.
|
||||
"""
|
||||
if config.obj.DO_BREAKPOINTS:
|
||||
input('Breakpoint here. <Enter> to continue...')
|
||||
|
||||
class PageRootObject:
|
||||
"""
|
||||
Common interface methods for working with pages. The specific
|
||||
page classes below inherit these methods and define additional methods
|
||||
so every page has available these methods and any additional
|
||||
methods they define.
|
||||
|
||||
In here are some re-used methods to click links and highlight
|
||||
elements in the browser.
|
||||
"""
|
||||
def __init__(self, driver=None):
|
||||
"""
|
||||
Object constructor for initializing the instance of this
|
||||
class with internal variables needed.
|
||||
|
||||
Args:
|
||||
driver: Reference to the selenium webdriver object
|
||||
that is used to interface with the web browser.
|
||||
"""
|
||||
if not driver:
|
||||
self.driver = create_webdriver()
|
||||
self.driver = browser.create_webdriver()
|
||||
else:
|
||||
self.driver = driver
|
||||
|
||||
def click(self, el):
|
||||
"""
|
||||
Clicks a link in the browser and also highlights it to the
|
||||
end user.
|
||||
|
||||
Args:
|
||||
el: selenium element to be clicked. Typically an anchor
|
||||
html link in the page.
|
||||
"""
|
||||
self.highlight(el, 'red')
|
||||
time.sleep(settings.PAGE_DELAY)
|
||||
time.sleep(config.obj.PAGE_DELAY)
|
||||
breakpoint()
|
||||
el.click()
|
||||
|
||||
def highlight(self, el, color):
|
||||
"""
|
||||
Highlights an html element in the web browser by changing the
|
||||
background color as well as making the text bold.
|
||||
|
||||
The implementation uses javascript to alter the css of the element.
|
||||
|
||||
Args:
|
||||
el: selenium element to be highlighted.
|
||||
color: background color to highlight. Input can be one of
|
||||
'red', 'blue', or hex code such as '#ffffff'.
|
||||
"""
|
||||
# Note: The way hex codes work is there is 1 byte (2 hex characters)
|
||||
# for every color. #RRGGBB for (red, green, blue). This can be thought
|
||||
# of as an integer 0-255 for red, green, and blue in base-16 hexadecimal.
|
||||
if color == 'red':
|
||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
|
||||
elif color == 'blue':
|
||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
|
||||
else:
|
||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
|
||||
self.driver.execute_script(js, el)
|
||||
|
||||
class LandingPage(PageRootObject):
|
||||
"""
|
||||
Interface for working with the wikipedia.org landing page. This page has links to
|
||||
select a language and go to the respective wikipedia root page.
|
||||
"""
|
||||
|
||||
# Note: This is the LandingPage() object constructor. All it does right now is
|
||||
# reference the parent (PageRootObject) constructor method and call it. This
|
||||
# calls PageRootObject.__init__(driver) which makes the web driver available
|
||||
# in the object instance.
|
||||
def __init__(self, driver=None):
|
||||
super().__init__(driver)
|
||||
|
||||
def goto_landing_page(self):
|
||||
self.driver.get(settings.PAGE_BASE_URL)
|
||||
self.driver.get(config.obj.PAGE_BASE_URL)
|
||||
|
||||
def select_language(self, language):
|
||||
link = self.driver.find_element_by_partial_link_text(language)
|
||||
self.click(link)
|
||||
|
||||
class MainPage(PageRootObject):
|
||||
"""
|
||||
Interface for a selected language root page. This has the link to go to a random article
|
||||
and has a featured article. An example url for this is https://en.wikipedia.org.
|
||||
"""
|
||||
def __init__(self, driver=None):
|
||||
super().__init__(driver)
|
||||
|
||||
@ -53,7 +112,17 @@ class MainPage(PageRootObject):
|
||||
self.click(link)
|
||||
|
||||
class ArticlePage(PageRootObject):
|
||||
"""
|
||||
Interface for a wikipedia article page. Here are defined some utility methods to
|
||||
try and click the first valid link and extract some information from the page.
|
||||
"""
|
||||
|
||||
# Here are static class-scoped variables that are needed to work with the page.
|
||||
# These are used to locate html elements in the web browser. There are many
|
||||
# ways to locate elements but one of the best if available is locating by id. It's
|
||||
# not enforced but the html specification mandates that element id's are unique
|
||||
# so if you can select by id in a semantically correct web page, you can correctly
|
||||
# select unique elements with high confidence.
|
||||
elements = {
|
||||
'main-window-content-text-id': 'mw-content-text',
|
||||
'article-title': 'firstHeading',
|
||||
@ -69,7 +138,24 @@ class ArticlePage(PageRootObject):
|
||||
def click_first_link(self):
|
||||
return self._iterate_paragraphs()
|
||||
|
||||
# Note: Here this method has its name prepended with a single underscore.
|
||||
# This is a convention that communicates to the developer that these methods
|
||||
# are internal private methods. That means they are not meant to be exposed
|
||||
# to the external interface. Python does not restrict calling these methods.
|
||||
# You can still call ArticlePage._iterate_paragraphs() but the prefix
|
||||
# underscore tells you that it is not intended to be exposed and may be
|
||||
# unsafe to call. Depending on the implementation, it may not make sense
|
||||
# to directly call this method and may result in undefined and unexpected
|
||||
# behavior. _iterate_paragraphs is called internally from the exposed
|
||||
# click_first_link() but is never invoked externally.
|
||||
def _iterate_paragraphs(self):
|
||||
"""
|
||||
Iterates through paragraphs in the page and attempts to find the first
|
||||
valid link. Sometimes the first paragraph does not have a link so this
|
||||
needs to go through a few paragraphs and it does not make sense to
|
||||
operate on the entire article every time when we're just looking for
|
||||
the first link, for performance optimization.
|
||||
"""
|
||||
main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
|
||||
paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
|
||||
for p in paragraphs:
|
||||
@ -89,8 +175,7 @@ class ArticlePage(PageRootObject):
|
||||
continue
|
||||
self.highlight(link, 'red')
|
||||
logging.info('selected link: %s' % link.text)
|
||||
breakpoint()
|
||||
link.click()
|
||||
self.click(link)
|
||||
return True
|
||||
|
||||
def _is_valid_link(self, p, el):
|
||||
@ -98,45 +183,61 @@ class ArticlePage(PageRootObject):
|
||||
b = self._is_link_a_footnote(el)
|
||||
c = self._is_link_pronounciation(el)
|
||||
d = self._is_link_audio(el)
|
||||
print(a, b, c, d)
|
||||
if not a and not b and not c and not d:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_link_in_parenthesis(self, p, el):
|
||||
# link_text = el.text
|
||||
"""
|
||||
Determine if a given link element is inside a set
|
||||
of textual parenthesis.
|
||||
"""
|
||||
# Implementation notes (mg):
|
||||
# I've tried a few different ways to do this and it's
|
||||
# hard to get it to work in every case. I want to avoid
|
||||
# certain links and usually avoid links inside parenthetical
|
||||
# notes. Some edge cases are nested parenthesis, links with
|
||||
# non-english characters (which are displayed with a tree
|
||||
# of elements in the html rather than a simple link). And
|
||||
# sometimes, the link inside the parenthesis may be a valid
|
||||
# target. I've made it so that skipped links show up as blue
|
||||
# and determined-valid links highlight as red.
|
||||
link_text = el.get_attribute('outerHTML')
|
||||
p_text = p.get_attribute('innerHTML')
|
||||
|
||||
regex_str = '\(.*?\)'
|
||||
regex_str = '\(.*?\)' # Regular expression to extract the
|
||||
# text inside (not nested) parenthesis
|
||||
regex = re.compile(regex_str, flags=re.UNICODE)
|
||||
match = regex.search(p_text)
|
||||
if not match:
|
||||
# There are no parenthesis at all in this paragraph.
|
||||
return False
|
||||
|
||||
while match is not None:
|
||||
# There may be multiple parenthesis (or nested). This
|
||||
# iterates through them and checks if the link's html
|
||||
# is present inside these parenthesis.
|
||||
#
|
||||
# Care must be taken with regular expressions as they are
|
||||
# user/developer unfriendly, hard-to-read, and unforgiving.
|
||||
# For example, what happens when you try to match (<anything>)
|
||||
# inside of (some words) some more words (even more words), you
|
||||
# can match unpaired parenthesis and the computer will return
|
||||
# unexpected results. The code is quite dumb and does exactly
|
||||
# what you tell it to.
|
||||
match_text = match.group(0)
|
||||
match_idx = match.end(0)
|
||||
print(link_text)
|
||||
print(match_text)
|
||||
if link_text in match_text:
|
||||
return True
|
||||
|
||||
match = regex.search(p_text, match_idx+1)
|
||||
|
||||
# Is the link inside parenthesis?
|
||||
# regex_str = '\([^()]*<a.*?>%s</a>[^())]*\)' % re.escape(link_text)
|
||||
# regex_str = '\(.*<a.*?>%s.*\)' % link_text
|
||||
# print(regex_str)
|
||||
# regex = re.compile(regex_str, flags=re.UNICODE)
|
||||
# match = re.search(regex_str, p_text)
|
||||
# if match: # Pattern is found in the text
|
||||
# print(match.group(0))
|
||||
# return True
|
||||
# else:
|
||||
# return False
|
||||
return False
|
||||
|
||||
def _is_link_a_footnote(self, el):
|
||||
# Some links are anchors to footnotes, e.g. [1] that points to a source
|
||||
# at the bottom of the page. These aren't valid links for our purpose
|
||||
# so this method looks for that and determines if the reference element
|
||||
# appears to be a link to a footnote.
|
||||
href = el.get_attribute('href')
|
||||
if '#cite_note' in href:
|
||||
return True
|
||||
@ -145,12 +246,18 @@ class ArticlePage(PageRootObject):
|
||||
return False
|
||||
|
||||
def _is_link_pronounciation(self, el):
|
||||
# Some links point to the wikipedia IPA (international phonetic
|
||||
# alphabet) pronunciation help page. We don't want to click these
|
||||
# links so we scan for and ignore them.
|
||||
href = el.get_attribute('href')
|
||||
if '/wiki/Help:IPA' in href:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_link_audio(self, el):
|
||||
# Some links are audio playback pronunciations. We look for these
|
||||
# by checking for the file extension .ogg (an audio file format,
|
||||
# ogg-vorbis) and ignoring links if they are of that type.
|
||||
href = el.get_attribute('href')
|
||||
if '.ogg' in href:
|
||||
return True
|
||||
|
@ -1,6 +1,9 @@
|
||||
import app.cli
|
||||
import app
|
||||
import settings
|
||||
|
||||
app.cli.init(settings.Settings)
|
||||
app.cli.main()
|
||||
# Inject the settings.DefaultSettings object into the
|
||||
# app and start running the program.
|
||||
app.init(settings.DefaultSettings)
|
||||
app.main()
|
||||
input('<enter> to exit')
|
||||
|
||||
|
12
settings.py
12
settings.py
@ -1,6 +1,16 @@
|
||||
# Application run-time configuration/settings. This contains variables
|
||||
# that control how the program works but are kept separate from the
|
||||
# program. It makes sense for certain parameters to be adjustable but
|
||||
# not hard-coded into the application. For example, some users may want
|
||||
# to run this program in English while others may want to run in Spanish.
|
||||
# The way this works is we specify those variables external from the
|
||||
# application (here) and pass them into the application (app.config module).
|
||||
# The application then references app.config.obj to access the variables
|
||||
# passed in from here.
|
||||
|
||||
import logging
|
||||
|
||||
class Settings:
|
||||
class DefaultSettings:
|
||||
# Application Parameters
|
||||
LOG_LEVEL = logging.INFO
|
||||
DO_BREAKPOINTS = True
|
||||
|
Loading…
Reference in New Issue
Block a user