mirror of
https://git.zavage.net/Zavage-Software/wikicrawl.git
synced 2024-11-24 00:59:19 -07:00
colored logging, multiple selenium drivers, multi language support
This commit is contained in:
parent
5d88690ded
commit
4aa965cfc8
@ -1,12 +0,0 @@
|
||||
# The __init__.py file signals to the python interpreter that the
|
||||
# app directory is a package. A package is a special module that
|
||||
# contains other modules. Each file is a module (browser, cli, etc.)
|
||||
# and the "app" package is a module that contains other modules.
|
||||
|
||||
# The "app" module exports the stuff exposed here. We export
|
||||
# app.init() as a reference to app.config.init() and app.main
|
||||
# as a reference to app.cli.main
|
||||
|
||||
from .config import init
|
||||
from .cli import main
|
||||
|
@ -1,7 +0,0 @@
|
||||
import logging
|
||||
|
||||
from . import config
|
||||
|
||||
def init_logging():
|
||||
logging.basicConfig(level=config.obj.LOG_LEVEL)
|
||||
|
@ -1,9 +1,9 @@
|
||||
import wikicrawl
import settings

# Inject the settings.DefaultSettings object into the wikicrawl
# package and start running the program. The old "app" package was
# renamed to "wikicrawl", so only the wikicrawl entry points are used.
wikicrawl.init(settings.DefaultSettings)
wikicrawl.main()

# Wait for the user to press <enter> before the process exits.
input('<enter> to exit')
|
||||
|
||||
|
99
settings.py
99
settings.py
@ -4,28 +4,107 @@
|
||||
# not hard-coded into the application. For example, some users may want
|
||||
# to run this program in English while others may want to run in Spanish.
|
||||
# The way this works is we specify those variables external from the
|
||||
# application (here) and pass them into the application (app.config module).
|
||||
# The application then references app.config.obj to access the variables
|
||||
# application (here) and pass them into the application (wikicrawl.config module).
|
||||
# The application then references wikicrawl.config.obj to access the variables
|
||||
# passed in from here.
|
||||
|
||||
import colorlog
|
||||
import logging
|
||||
import logging.config
|
||||
|
||||
class DefaultSettings:
    """
    Default configuration values for the wikicrawl application.

    The launcher passes this class to wikicrawl.init(); the application
    then reads the values through wikicrawl.config.obj. Every setting is
    a plain class attribute, so a variant configuration can subclass
    this and override individual values.
    """

    # Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!!
    LOG_FILENAME = '/tmp/wikicrawl.log'
    SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'

    # Application Parameters
    LOG_LEVEL = logging.INFO
    DO_BREAKPOINTS = False
    PAGE_DELAY = 0

    # Web Driver Parameters
    WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'

    # Requested browser and webdriver dependencies are required for this to work.
    # This means you need to have installed on your system:
    # Chrome + WebDriver for Chrome
    # Firefox + geckodriver for Firefox
    # phantomjs for phantom
    WEBDRIVER_BROWSER = 'chrome'  # Options are 'chrome', 'firefox', 'phantom'

    # Wikipedia Parameters
    PAGE_BASE_URL = 'https://www.wikipedia.org/'

    # Supported Languages so far:
    # German, English, Spanish, French, Italian, Portuguese, Polish, Russian
    # 'de', 'en', 'es', 'fr', 'it', 'pl', 'pt', 'ru'
    PAGE_LANGUAGE = 'en'

    # API Keys
    # NOTE(review): this is a live-looking credential committed to source
    # control. The value is left unchanged so behavior is preserved, but it
    # should be rotated and loaded from outside the repository.
    YANDEX_API_KEY = 'trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3'

    # Logging Parameters (a logging.config.dictConfig schema)
    LOG_SETTINGS = {
        'version': 1,  # version is always 1
        'formatters': {
            'colored': {
                '()': 'colorlog.ColoredFormatter',
                'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(blue)s%(message)s'
            },
            'basic': {
                '()': 'logging.Formatter',
                'format': '%(levelname)s:%(name)s:%(asctime)s:%(message)s'
            }
        },
        'handlers': {
            'stderr': {
                'class': 'logging.StreamHandler',
                # The handler level will override the logger level if higher.
                # That is, if the logger level is set to pass through DEBUG
                # and higher and the handler is set to only pass through WARNING
                # and higher, DEBUG messages will not pass through to this
                # logger's handler. You can configure multiple handlers for any
                # logger so for example you could log WARNINGS and ERRORS to a
                # file but not save all the DEBUG messages.
                'level': logging.DEBUG,
                'formatter': 'colored'
            },
            'file': {
                'class': 'logging.handlers.RotatingFileHandler',
                'level': logging.INFO,
                'formatter': 'basic',
                'filename': LOG_FILENAME,
                'maxBytes': 32768,
                'backupCount': 3
            }
        },
        'loggers': {
            # Root Logger
            '': {
                'level': logging.DEBUG,
                'handlers': ['file'],
            },
            'main': {
                'level': logging.DEBUG,
                'handlers': ['stderr'],
                'propagate': False
            },
            'model': {
                'level': logging.DEBUG,
                'handlers': ['stderr'],
                'propagate': True
            },
            'cli': {
                'level': logging.DEBUG,
                'handlers': ['stderr'],
                'propagate': False
            },
            'pages': {
                'level': logging.INFO,
                'handlers': ['stderr'],
                'propagate': False
            }
        }
    }
|
||||
|
||||
|
||||
|
||||
|
23
setup.py
23
setup.py
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env python
|
||||
# setup.py is the install script for this application. This will download
|
||||
# required third-party dependencies and package the app. You can also
|
||||
# install the application system-wide.
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
__project__ = 'wikicrawl'
# If you're looking for a versioning scheme, one revered pattern
# can be read about at http://semver.org
__version__ = '0.9.0'

setup(
    name=__project__,
    version=__version__,
    description='',
    author='',
    author_email='',
    url='',
    # Every third-party distribution the package imports has to be listed
    # here, otherwise a fresh install fails at import time: the cli module
    # imports baker and the log module imports colorlog.
    install_requires=[
        'baker',
        'colorlog',
        'selenium',
        'yandex.translate',
    ],
    # wikicrawl.assets holds languages.py, which the pages module imports,
    # so it must be packaged alongside the top-level package.
    # NOTE(review): confirm the assets directory ships an __init__.py.
    packages=[
        'wikicrawl',
        'wikicrawl.assets',
    ],
)
|
||||
|
BIN
wikicrawl/.dal.py.swp
Normal file
BIN
wikicrawl/.dal.py.swp
Normal file
Binary file not shown.
12
wikicrawl/__init__.py
Normal file
12
wikicrawl/__init__.py
Normal file
@ -0,0 +1,12 @@
|
||||
# The __init__.py file signals to the python interpreter that the
|
||||
# wikicrawl directory is a package. A package is a special module that
|
||||
# contains other modules. Each file is a module (browser, cli, etc.)
|
||||
# and the "wikicrawl" package is a module that contains other modules.
|
||||
|
||||
# The wikicrawl package, which is a module, exports the stuff exposed here.
|
||||
# We export wikicrawl.init() as a reference to wikicrawl.config.init() and
|
||||
# wikicrawl.main as a reference to wikicrawl.cli.main
|
||||
|
||||
from .config import init
|
||||
from .main import main
|
||||
|
39
wikicrawl/assets/languages.py
Normal file
39
wikicrawl/assets/languages.py
Normal file
@ -0,0 +1,39 @@
|
||||
|
||||
# Maps a two-letter language code to that language's name written in the
# language itself. LandingPage.select_language() in the pages module looks
# the code up here and searches the landing page for a link containing the
# resulting text.
# Entries with an empty string have no link text recorded; presumably they
# are placeholders for languages not wired up yet -- TODO confirm.
LANGUAGES = {
    'az': '',
    'be': '',
    'bg': '',
    'ca': '',
    'cs': '',
    'da': '',
    'de': 'Deutsch',
    'el': '',
    'en': 'English',
    'es': 'Español',
    'et': '',
    'fi': '',
    'fr': 'Français',
    'hr': '',
    'hu': '',
    'hy': '',
    'it': 'Italiano',
    # 'ja': '日本語', -- no japanese in yandex
    'lt': '',
    'lv': '',
    'mk': '',
    'nl': '',
    'no': '',
    'pl': 'Polski',
    'pt': 'Português',
    'ro': '',
    'ru': 'Русский',
    'sk': '',
    'sl': '',
    'sq': '',
    'sr': '',
    'sv': '',
    'tr': '',
    'uk': '',
    # 'zh': '中文' -- no chinese
}
|
||||
|
@ -11,9 +11,9 @@
|
||||
|
||||
import selenium
|
||||
import selenium.webdriver
|
||||
import logging
|
||||
|
||||
from . import config
|
||||
from . import log
|
||||
|
||||
# This function has a parameter (driver) that passes in a value. In this case,
|
||||
# this driver variable defaults to the string 'chrome'. The code can call
|
||||
@ -25,9 +25,17 @@ def create_webdriver(driver='chrome'):
|
||||
return create_webdriver_chrome()
|
||||
elif driver == 'firefox':
|
||||
return create_webdriver_firefox()
|
||||
elif driver == 'phantom':
|
||||
return create_webdriver_phantom()
|
||||
else:
|
||||
log.LOGGER('browser').error('unable to handle webdriver request: %s' % driver)
|
||||
return
|
||||
|
||||
def create_webdriver_firefox():
    """
    Creates a Firefox webdriver whose user agent string is overridden
    with the configured WEBDRIVER_USER_AGENT.

    Returns:
        The selenium Firefox webdriver object.
    """
    firefox_profile = selenium.webdriver.FirefoxProfile()
    # The user agent is a browser preference, so it is set on the profile
    # before the browser is launched with that profile.
    firefox_profile.set_preference(
        "general.useragent.override",
        config.obj.WEBDRIVER_USER_AGENT,
    )
    return selenium.webdriver.Firefox(firefox_profile)
|
||||
|
||||
def create_webdriver_chrome():
|
||||
opt = selenium.webdriver.chrome.options.Options()
|
||||
@ -35,3 +43,7 @@ def create_webdriver_chrome():
|
||||
driver = selenium.webdriver.Chrome(chrome_options = opt)
|
||||
return driver
|
||||
|
||||
def create_webdriver_phantom():
    """
    Creates a PhantomJS webdriver with selenium's default options.

    Returns:
        The selenium PhantomJS webdriver object.
    """
    return selenium.webdriver.PhantomJS()
|
||||
|
@ -1,37 +1,42 @@
|
||||
#!/usr/bin/env python
|
||||
# The command-line interface module creates a interface for
|
||||
# The command-line interface module creates an interface for
|
||||
# interacting with the python program (wikicrawl). This is an implementation
|
||||
# of the baker demo shown previously. The user can type in commands to
|
||||
# make the program do things.
|
||||
|
||||
import baker
|
||||
import logging
|
||||
import readline # Needed for command history <up> and <down> arrows to work
|
||||
import sys
|
||||
|
||||
|
||||
if sys.platform == 'linux':
|
||||
import readline # Needed for command history <up> and <down> arrows to work
|
||||
|
||||
from . import log
|
||||
from . import model
|
||||
from . import config
|
||||
|
||||
# Problem pages:
|
||||
# Decision (from politics)
|
||||
# Malaysia (goes inside parenthesis)
|
||||
# Soft-sediment_deformation_structures (doesn't find link)
|
||||
# Chemicals (loops at philosophical)
|
||||
|
||||
commander = baker.Baker()
|
||||
|
||||
def main():
|
||||
user_interface = InteractiveInterface()
|
||||
|
||||
if len(sys.argv) > 1: # Command line arguments were passed in
|
||||
# command-line when invoking python
|
||||
user_interface.run(sys.argv)
|
||||
else:
|
||||
user_interface.start_command_loop()
|
||||
|
||||
class InteractiveInterface:
|
||||
def __init__(self):
|
||||
# Instantiate the variable self.model as an object
|
||||
# of instance of the Model class defined in the model
|
||||
# module. model.Model refers to the Model class in the
|
||||
# model module and this line creates a new variable (self.model)
|
||||
# which is a variable that is an instance of Model, i.e.
|
||||
# it has the type Model and has Model.methods() available
|
||||
# to it.
|
||||
#
|
||||
# self.model is a variable that is attached to the instance/object
|
||||
# returned by this constructor that has the type InteractiveInterface.
|
||||
self.model = model.Model()
|
||||
|
||||
def run(self, args, main=True):
|
||||
def run_command(self, args, main=True):
|
||||
"""
|
||||
Runs the command-line interface for a single command.
|
||||
|
||||
@ -45,13 +50,13 @@ class InteractiveInterface:
|
||||
commander.run(argv=args, main=True, help_on_error=True,
|
||||
instance=self)
|
||||
except baker.CommandError as ex:
|
||||
logging.warn('incorrect user input: %s' % ex)
|
||||
log.LOGGER['cli'].warn('incorrect user input: %s' % ex)
|
||||
commander.usage()
|
||||
except baker.TopHelp as ex:
|
||||
commander.usage()
|
||||
except Exception as ex:
|
||||
logging.error('caught general exception!!')
|
||||
print(type(ex), ex)
|
||||
log.LOGGER['cli'].error('caught general exception!!')
|
||||
log.LOGGER['cli'].error(type(ex), ex)
|
||||
|
||||
def start_command_loop(self):
|
||||
"""
|
||||
@ -76,12 +81,13 @@ class InteractiveInterface:
|
||||
# to NOT drop to a newline after printing
|
||||
# in the terminal. Instead, let the user
|
||||
# type their command on the same line as
|
||||
# our printed '$ '.
|
||||
# the printed '$ '.
|
||||
try:
|
||||
inp = input()
|
||||
except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
|
||||
break
|
||||
# Note in arguments (mg):
|
||||
|
||||
# Note on "arguments" (mg):
|
||||
# Whenever a program is run in windows or *nix, the operating
|
||||
# system passes in the command string that was used to invoke
|
||||
# the program. You can append data in that command to configure
|
||||
@ -91,16 +97,16 @@ class InteractiveInterface:
|
||||
# software but you can also pass in an argument. You can
|
||||
# alternatively run "python launcher.py <argument> <argument>..."
|
||||
# and the operating system will provide the <argument> values into
|
||||
# the process that is running.
|
||||
# the process that is running as variables.
|
||||
#
|
||||
# In a real world use case, many commands provide switches to
|
||||
# adjust what the program does. For example,
|
||||
#
|
||||
# The command:
|
||||
# find music -iname "*justin*bieber*"
|
||||
# find music -name "*justin*bieber*"
|
||||
# runs the "find" program and asks to find all the filenames that match the
|
||||
# pattern *justin*bieber* in the "music" directory.
|
||||
# (music, -iname, "*justin*biever*") are argument parameters
|
||||
# (music, -name, "*justin*bieber*") are argument parameters
|
||||
# that are passed into the program. The program is coded to
|
||||
# parse and interpret these values and execute differently based
|
||||
# on the values passed in. This is one way to pass in information
|
||||
@ -123,14 +129,21 @@ class InteractiveInterface:
|
||||
# would be C:\Users\mguest\launcher.py.
|
||||
|
||||
# What this method (start_command_loop()) does is provide a
|
||||
# REPL which is a
|
||||
# REPL shell which is a
|
||||
# read-eval-print-loop. It repeatedly asks the user for an
|
||||
# input (read), evaluates that input into an action (evaluate),
|
||||
# give the user some feedback (print), and start the process
|
||||
# over again (loop). When you call "python", you are given a python
|
||||
# process that gives you a REPL interactive shell. The way
|
||||
# over again (loop). When you call just "python", you are loading a
|
||||
# program that gives you a REPL interactive shell. The way
|
||||
# this wikicrawl app is implemented gives the user a REPL
|
||||
# that has commands to interact with wikipedia pages.
|
||||
|
||||
# Because we take in the input as a single string, we do
|
||||
# a transformation to turn something like "do_random_page 5"
|
||||
# into ["launcher.py", "do_random_page", "5"] which is how
|
||||
# the arguments array would have been created if it were
|
||||
# passed in the initial command instead of typed and interpreted
|
||||
# as input as is done here.
|
||||
args = [sys.argv[0], ] + inp.split()
|
||||
|
||||
# The user can at any point in the command pass the argument
|
||||
@ -146,40 +159,42 @@ class InteractiveInterface:
|
||||
# python launcher.py do_random_page --help
|
||||
# You will see the program spit out the heredoc below the
|
||||
# do_random_page method defined below.
|
||||
|
||||
if '--help' in args:
|
||||
args.remove('--help')
|
||||
try:
|
||||
print('command usage:')
|
||||
commander.usage(args[1])
|
||||
return
|
||||
except Exception as ex:
|
||||
print(type(ex), ex)
|
||||
continue
|
||||
|
||||
self.run(args, main=False)
|
||||
self.run_command(args, main=False)
|
||||
|
||||
@commander.command
def play_random_page(self):
    """
    Instructs the wikicrawl application to play the game on a random
    article.
    """
    # The model method was renamed from do_random_page to
    # play_random_page; only the new name is called here.
    self.model.play_random_page()
|
||||
|
||||
@commander.command
def play_multiple(self, n):
    """
    Plays the wikicrawl game <n>-times.

    Args:
        n: Number of games to play. It is converted with int(), so a
           numeric string typed at the prompt is accepted.

    Returns:
        False if n cannot be converted to an integer, otherwise None.
    """
    try:
        rounds = int(n)
    except ValueError:
        # Logger.warn() is a deprecated alias; warning() is the
        # supported spelling.
        log.LOGGER['cli'].warning('failed to process "%s" as a parameter' % n)
        return False
    for _ in range(rounds):
        self.model.play_random_page()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@commander.command
def exit(self):
    """
    Immediately exit the program.
    """
    # sys.exit(0) works by raising SystemExit(0); raise it directly.
    raise SystemExit(0)
|
||||
|
33
wikicrawl/log.py
Normal file
33
wikicrawl/log.py
Normal file
@ -0,0 +1,33 @@
|
||||
# log module is a wrapper around third-party colorlog library
|
||||
# and provides an application-level interface to a logging system.
|
||||
|
||||
import colorlog
|
||||
import logging
|
||||
|
||||
from . import config
|
||||
|
||||
# Default python log severity levels:
|
||||
# CRITICAL
|
||||
# ERROR
|
||||
# WARNING
|
||||
# INFO
|
||||
# DEBUG
|
||||
|
||||
LOGGER = None
|
||||
|
||||
class LoggingLayer:
    """
    Application-level registry of named loggers.

    Constructing the object applies a logging configuration dictionary.
    After that, loggers are fetched by name with subscript syntax, e.g.
    LOGGER['cli'], and each one is remembered in self.loggers.
    """

    def __init__(self, config):
        """
        Args:
            config: Dictionary in the logging.config.dictConfig schema.
        """
        # logging.config is a submodule that a plain "import logging" does
        # not load. Import it here so this class does not depend on some
        # other module having imported it first.
        import logging.config

        self.loggers = {}
        logging.config.dictConfig(config)

    def __getitem__(self, k):
        """
        Returns the logger named k, creating and caching it on first use.
        """
        try:
            return self.loggers[k]
        except KeyError:
            logger = self.loggers[k] = logging.getLogger(k)
            return logger
|
||||
|
||||
def init_logging():
    """
    Builds the module-level LOGGER from the LOG_SETTINGS dictionary of
    the injected configuration object. LOGGER is None until this runs.
    """
    global LOGGER
    log_settings = config.obj.LOG_SETTINGS
    LOGGER = LoggingLayer(log_settings)
|
||||
|
19
wikicrawl/main.py
Normal file
19
wikicrawl/main.py
Normal file
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
|
||||
from . import cli
|
||||
from . import util
|
||||
|
||||
def main():
    """
    Program entry point. Runs a single command when arguments were given
    on the command line, otherwise starts the interactive command loop.
    """
    ui = cli.InteractiveInterface()

    # sys.argv always holds the script name, so a length of 1 means no
    # command line arguments were passed in when invoking python.
    if len(sys.argv) <= 1:
        ui.start_command_loop()
        return

    ui.run_command(sys.argv)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@ -6,7 +6,6 @@
|
||||
# to implement the wiki crawl. This is a separation of concerns
|
||||
# and keeps the logic organized and separated.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
@ -15,10 +14,12 @@ from . import config
|
||||
from . import dal
|
||||
from . import log
|
||||
from . import pages
|
||||
from . import util
|
||||
|
||||
class Model:
|
||||
def __init__(self):
|
||||
self._webdriver = None
|
||||
self._translated_philosophy = None
|
||||
|
||||
@property
|
||||
def webdriver(self):
|
||||
@ -37,6 +38,16 @@ class Model:
|
||||
page_api.goto_landing_page()
|
||||
return self._webdriver
|
||||
|
||||
@property
def translated_philosophy(self):
    """
    The word 'philosophy' in the configured PAGE_LANGUAGE.

    For English no translation is needed. For any other language the
    word is translated once and the result is cached on the instance,
    so later reads do not call the translation API again.
    """
    if config.obj.PAGE_LANGUAGE == 'en':
        self._translated_philosophy = 'philosophy'
        return self._translated_philosophy

    if not self._translated_philosophy:
        self._translated_philosophy = util.translate_text(
            'en', config.obj.PAGE_LANGUAGE, 'philosophy')
    return self._translated_philosophy
|
||||
|
||||
def open_browser(self):
|
||||
x = self.webdriver # Request the browser open immediately.
|
||||
# Without this, the Model object will
|
||||
@ -48,7 +59,7 @@ class Model:
|
||||
# creates it and then it is re-used later
|
||||
# in the application.
|
||||
|
||||
def do_random_page(self):
|
||||
def play_random_page(self):
|
||||
"""
|
||||
Select a random page and repeatedly click the first link until
|
||||
we reach the article on philosophy. Sometimes, the driver encounters
|
||||
@ -93,25 +104,50 @@ class Model:
|
||||
# the action we're trying to invoke.
|
||||
page_api.goto_random_article()
|
||||
|
||||
# Article page
|
||||
# Article pages
|
||||
pages_visited = []
|
||||
|
||||
# We just need translated_title to exist
|
||||
translated_title = None
|
||||
while True:
|
||||
page_api = pages.ArticlePage(self.webdriver)
|
||||
|
||||
# Get the article title (and translate if necessary)
|
||||
title = page_api.get_title()
|
||||
logging.debug('visited page: %s' % title)
|
||||
if config.obj.PAGE_LANGUAGE != 'en':
|
||||
translated_title = util.translate_text(config.obj.PAGE_LANGUAGE, 'en', title)
|
||||
log.LOGGER['model'].info('visited page: %s (%s)' % (title, translated_title))
|
||||
else:
|
||||
log.LOGGER['model'].info('visited page: %s' % title)
|
||||
|
||||
# Check for page loops (have we already visited this page?)
|
||||
if title in pages_visited:
|
||||
logging.info('encountered loop at page = %s' % title)
|
||||
log.LOGGER['model'].info('encountered loop at page = %s' % title)
|
||||
break
|
||||
if title == 'Philosophy':
|
||||
logging.info('made it to philosophy in %s pages' % len(pages_visited))
|
||||
|
||||
# Check if we reached the article on philosophy
|
||||
if self._is_article_on_philosophy(title, translated_title):
|
||||
log.LOGGER['model'].info('made it to philosophy in %s pages' % len(pages_visited))
|
||||
pages_visited.append(title)
|
||||
break
|
||||
|
||||
# Store the result of what articles have been navigated
|
||||
pages_visited.append(title)
|
||||
|
||||
rc = page_api.click_first_link()
|
||||
if not rc:
|
||||
logging.warn('failure: unable to continue (perhaps no valid links?)')
|
||||
log.LOGGER['model'].warn('failure: unable to continue (perhaps no valid links?)')
|
||||
break
|
||||
print()
|
||||
|
||||
def _is_article_on_philosophy(self, title, translated_title):
    """
    Decides whether the current article is the one on philosophy.

    Args:
        title: Article title as shown on the page.
        translated_title: The title translated to english, or None
            when no translation was made.

    Returns:
        True if the title equals the word for philosophy in the page
        language, or the translated title equals 'philosophy'. Both
        comparisons ignore case. False otherwise.
    """
    matches_native_word = title.lower() == self.translated_philosophy.lower()
    if matches_native_word:
        return True
    # A missing translation is treated as an empty string, which can
    # never equal 'philosophy'.
    english_title = (translated_title or '').lower()
    return english_title == 'philosophy'
|
||||
|
@ -1,22 +1,16 @@
|
||||
# Pages module defines classes for interacting with wikipedia pages.
|
||||
# pages module defines classes for interacting with wikipedia pages.
|
||||
# There are separate classes defined for each page with their own
|
||||
# defined methods for performing certain actions.
|
||||
|
||||
import logging
|
||||
import re
|
||||
import selenium
|
||||
import time
|
||||
|
||||
from . import browser
|
||||
from . import config
|
||||
|
||||
def breakpoint():
|
||||
"""
|
||||
If DO_BREAKPOINTS is switched on, this will pause program
|
||||
execution and wait for the user to press enter to continue.
|
||||
"""
|
||||
if config.obj.DO_BREAKPOINTS:
|
||||
input('Breakpoint here. <Enter> to continue...')
|
||||
from . import log
|
||||
from . import util
|
||||
from .assets.languages import LANGUAGES
|
||||
|
||||
class PageRootObject:
|
||||
"""
|
||||
@ -28,7 +22,7 @@ class PageRootObject:
|
||||
In here are some re-used methods to click links and highlight
|
||||
elements in the browser.
|
||||
"""
|
||||
def __init__(self, driver=None):
|
||||
def __init__(self, driver):
|
||||
"""
|
||||
Object constructor for initializing the instance of this
|
||||
class with internal variables needed.
|
||||
@ -37,9 +31,6 @@ class PageRootObject:
|
||||
driver: Reference to the selenium webdriver object
|
||||
that is used to interface with the web browser.
|
||||
"""
|
||||
if not driver:
|
||||
self.driver = browser.create_webdriver()
|
||||
else:
|
||||
self.driver = driver
|
||||
|
||||
def click(self, el):
|
||||
@ -49,11 +40,11 @@ class PageRootObject:
|
||||
|
||||
Args:
|
||||
el: selenium element to be clicked. Typically an anchor
|
||||
html link in the page.
|
||||
html link in the webpage.
|
||||
"""
|
||||
self.highlight(el, 'red')
|
||||
time.sleep(config.obj.PAGE_DELAY)
|
||||
breakpoint()
|
||||
util.breakpoint()
|
||||
el.click()
|
||||
|
||||
def highlight(self, el, color):
|
||||
@ -68,36 +59,55 @@ class PageRootObject:
|
||||
color: background color to highlight. Input can be one of
|
||||
'red', 'blue', or hex code such as '#ffffff'.
|
||||
"""
|
||||
# Note: The way hex codes work is there are 1 byte (2 hex characters)
|
||||
# Note: The way hex codes work is there is 1 byte (2 hex characters)
|
||||
# for every color. #RRGGBB for (red, green, blue). This can be thought
|
||||
# of as an integer 0-255 for red, green, and blue in base-16 hexadecimal.
|
||||
# For example, #ff0000 is bright red while #002f00 is dark green
|
||||
# and #ffff00 is full yellow.
|
||||
if color == 'red':
|
||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
|
||||
color = '#ff9292'
|
||||
# js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
|
||||
elif color == 'blue':
|
||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
|
||||
color = '#9292ff'
|
||||
# js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
|
||||
else:
|
||||
# color = color
|
||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
|
||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
|
||||
self.driver.execute_script(js, el)
|
||||
|
||||
# Note: This is the syntax for class inheritance. LandingPage is a new type of object that inherits
|
||||
# everything from the PageRootObject type. With this, you can call LandingPage.highlight() which
|
||||
# is a method defined in PageRootObject.
|
||||
class LandingPage(PageRootObject):
    """
    Interface for working with the wikipedia.org landing page. This page has links to
    select a language and go to the respective wikipedia root page.
    """

    # Note: This is the LandingPage() class constructor. The constructor is a method
    # that is executed when a new object of this class is created. All it does right now is
    # reference the parent (PageRootObject) constructor method and call it. This
    # calls PageRootObject.__init__(driver) which then makes the web driver available
    # in the object instance.
    def __init__(self, driver=None):
        super().__init__(driver)

    def goto_landing_page(self):
        """
        Navigates the browser to the configured PAGE_BASE_URL.
        """
        self.driver.get(config.obj.PAGE_BASE_URL)

    def select_language(self, language):
        """
        Clicks the landing page link for the requested language.

        Args:
            language: Language code used as a key into LANGUAGES,
                such as 'en' or 'de'.

        Returns:
            True if the link was found and clicked. False if the code
            has no link text in LANGUAGES or the link is not on the page.
        """
        lang_text = LANGUAGES.get(language)
        if not lang_text:
            # Unknown codes give None and unfinished entries give ''.
            # Neither is usable as link text, so fail the same way a
            # missing link does instead of searching for it.
            log.LOGGER['pages'].warning('failed to find language: %s as %s' % (language, lang_text))
            return False
        try:
            link = self.driver.find_element_by_partial_link_text(lang_text)
            self.click(link)
            return True
        except selenium.common.exceptions.NoSuchElementException:
            log.LOGGER['pages'].warning('failed to find language: %s as %s' % (language, lang_text))
            return False
|
||||
|
||||
class MainPage(PageRootObject):
|
||||
"""
|
||||
@ -121,7 +131,7 @@ class ArticlePage(PageRootObject):
|
||||
# These are used to locate html elements in the web browser. There are many
|
||||
# ways to locate elements but one of the best if available is locating by id. It's
|
||||
# not enforced but the html specification mandates that element id's are unique
|
||||
# so if you can select by id in a semanticly correct web page, you can correctly
|
||||
# so if you can select by id in a semantically correct web page, you can correctly
|
||||
# select unique elements with high confidence.
|
||||
elements = {
|
||||
'main-window-content-text-id': 'mw-content-text',
|
||||
@ -132,10 +142,18 @@ class ArticlePage(PageRootObject):
|
||||
super().__init__(driver)
|
||||
|
||||
def get_title(self):
    """
    Returns the text of the article's title heading.
    """
    title_element_id = ArticlePage.elements['article-title']
    return self.driver.find_element_by_id(title_element_id).text
|
||||
|
||||
def click_first_link(self):
    """
    Tries to click the first valid link in the article text. Certain
    links are skipped on purpose, but the implementation still breaks
    in some edge cases, so it is close but not perfect for every article.

    Returns:
        Whatever _iterate_paragraphs() returns, unchanged.
    """
    clicked = self._iterate_paragraphs()
    return clicked
|
||||
|
||||
# Note: Here this method has its name prepended with a single underscore.
|
||||
@ -159,33 +177,51 @@ class ArticlePage(PageRootObject):
|
||||
main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
|
||||
paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
|
||||
for p in paragraphs:
|
||||
# Return code indicates the success status of _parse_paragraph().
|
||||
# In this case, an rc of True means that it was able to find a
|
||||
# link and we stop going through paragraphs.
|
||||
rc = self._parse_paragraph(p)
|
||||
if rc:
|
||||
return True
|
||||
|
||||
def _parse_paragraph(self, p):
    """
    Attempts to find a valid link in the paragraph element sent in
    and clicks the first one found.

    Args:
        p: Reference to selenium paragraph element. This is a paragraph
           taken from the article.

    Returns:
        True if a valid link was found and clicked, False if the
        paragraph has no links or none of them is valid.
    """
    links = p.find_elements_by_xpath('.//a')
    for link in links:
        log.LOGGER['pages'].debug('processing link: %s' % link.text)
        if not self._is_valid_link(p, link):
            log.LOGGER['pages'].debug('skipping link inside parenthesis: %s' % link.text)
            # Skipped links are marked blue so they can be told apart
            # from the selected link in the browser.
            self.highlight(link, 'blue')
            continue
        self.highlight(link, 'red')
        log.LOGGER['pages'].info('selected link: %s' % link.text)
        self.click(link)
        return True
    # Either the paragraph had no links or every link was rejected.
    return False
|
||||
|
||||
def _is_valid_link(self, p, el):
    """
    Returns True if the link should be followed and False if the
    implementation decides to skip it. A link is skipped when it is
    inside parenthesis, is a footnote, is a pronunciation guide or is
    an audio link.

    Args:
        p: Selenium paragraph element that contains the link.
        el: Selenium link element being checked.
    """
    # The checks run in order and stop at the first reason to skip.
    if self._is_link_in_parenthesis(p, el):
        return False
    if self._is_link_a_footnote(el):
        return False
    if self._is_link_pronounciation(el):
        return False
    if self._is_link_audio(el):
        return False
    return True
|
||||
|
||||
def _is_link_in_parenthesis(self, p, el):
|
||||
"""
|
||||
@ -198,7 +234,7 @@ class ArticlePage(PageRootObject):
|
||||
# certain links and usually avoid links inside parenthetical
|
||||
# notes. Some edge cases are nested parenthesis, links with
|
||||
# non-english characters (which are displayed with a tree
|
||||
# of elements in the html rather than a simply link). And
|
||||
# of elements in the html rather than a simple link). And
|
||||
# sometimes, the link inside the parenthesis may be a valid
|
||||
# target. I've made it so that skipped links show up as blue
|
||||
# and determined-valid links highlight as red.
|
23
wikicrawl/util.py
Normal file
23
wikicrawl/util.py
Normal file
@ -0,0 +1,23 @@
|
||||
# util module contains utility functions that can be common or shared
|
||||
# between the other modules.
|
||||
|
||||
import yandex_translate
|
||||
|
||||
from . import config
|
||||
|
||||
def breakpoint():
    """
    Pauses the program until the user presses enter, but only when the
    DO_BREAKPOINTS setting is switched on. Otherwise it does nothing.
    """
    if not config.obj.DO_BREAKPOINTS:
        return
    input('BREAKPOINT hit. <Enter> to continue...')
|
||||
|
||||
def translate_text(source_language, target_language, text):
    """
    Translates text using the Yandex translate client.

    Args:
        source_language: Code of the language the text is written in.
            If it is empty or None, the client is asked to detect it.
        target_language: Code of the language to translate into.
        text: The text to translate.

    Returns:
        The first entry of the 'text' list in the client's response.
    """
    client = yandex_translate.YandexTranslate(config.obj.YANDEX_API_KEY)
    origin = source_language if source_language else client.detect(text)
    # The client takes the direction as a "<source>-<target>" pair.
    direction = '%s-%s' % (origin, target_language)
    response = client.translate(text, direction)
    return response['text'][0]
|
||||
|
Loading…
Reference in New Issue
Block a user