mirror of
https://git.zavage.net/Zavage-Software/wikicrawl.git
synced 2024-11-24 00:59:19 -07:00
colored logging, multiple selenium drivers, multi language support
This commit is contained in:
parent
5d88690ded
commit
4aa965cfc8
@ -1,12 +0,0 @@
|
|||||||
# The __init__.py file signals to the python interpreter that the
|
|
||||||
# app directory is a package. A package is a special module that
|
|
||||||
# contains other modules. Each file is a module (browser, cli, etc.)
|
|
||||||
# and the "app" package is a module that contains other modules.
|
|
||||||
|
|
||||||
# The "app" module exports the stuff exposed here. We export
|
|
||||||
# app.init() as a reference to app.config.init() and app.main
|
|
||||||
# as a reference to app.cli.main
|
|
||||||
|
|
||||||
from .config import init
|
|
||||||
from .cli import main
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
|||||||
import logging
|
|
||||||
|
|
||||||
from . import config
|
|
||||||
|
|
||||||
def init_logging():
|
|
||||||
logging.basicConfig(level=config.obj.LOG_LEVEL)
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
|||||||
import app
|
import wikicrawl
|
||||||
import settings
|
import settings
|
||||||
|
|
||||||
# Inject the settings.DefaultSettings object into the
|
# Inject the settings.DefaultSettings object into the
|
||||||
# app and start running the program.
|
# app and start running the program.
|
||||||
app.init(settings.DefaultSettings)
|
wikicrawl.init(settings.DefaultSettings)
|
||||||
app.main()
|
wikicrawl.main()
|
||||||
input('<enter> to exit')
|
input('<enter> to exit')
|
||||||
|
|
||||||
|
99
settings.py
99
settings.py
@ -4,28 +4,107 @@
|
|||||||
# not hard-coded into the application. For example, some users may want
|
# not hard-coded into the application. For example, some users may want
|
||||||
# to run this program in English while others may want to run in Spanish.
|
# to run this program in English while others may want to run in Spanish.
|
||||||
# The way this works is we specify those variables external from the
|
# The way this works is we specify those variables external from the
|
||||||
# application (here) and pass them into the application (app.config module).
|
# application (here) and pass them into the application (wikicrawl.config module).
|
||||||
# The application then references app.config.obj to access the variables
|
# The application then references wikicrawl.config.obj to access the variables
|
||||||
# passed in from here.
|
# passed in from here.
|
||||||
|
|
||||||
|
import colorlog
|
||||||
import logging
|
import logging
|
||||||
|
import logging.config
|
||||||
|
|
||||||
class DefaultSettings:
|
class DefaultSettings:
|
||||||
|
# Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!!
|
||||||
|
LOG_FILENAME = '/tmp/wikicrawl.log'
|
||||||
|
SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
|
||||||
|
|
||||||
# Application Parameters
|
# Application Parameters
|
||||||
LOG_LEVEL = logging.INFO
|
DO_BREAKPOINTS = False
|
||||||
DO_BREAKPOINTS = True
|
|
||||||
PAGE_DELAY = 0
|
PAGE_DELAY = 0
|
||||||
|
|
||||||
# Web Driver Parameters
|
# Web Driver Parameters
|
||||||
WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'
|
WEBDRIVER_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)'
|
||||||
WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox'
|
|
||||||
|
# Requested browser and webdriver dependencies are required for this to work.
|
||||||
|
# This means you need to have installed on your system:
|
||||||
|
# Chrome + WebDriver for Chrome
|
||||||
|
# Firefox + geckodriver for Firefox
|
||||||
|
# phantomjs for phantom
|
||||||
|
WEBDRIVER_BROWSER = 'chrome' # Options are 'chrome', 'firefox', 'phantom'
|
||||||
|
|
||||||
# Wikipedia Parameters
|
# Wikipedia Parameters
|
||||||
PAGE_BASE_URL = 'https://www.wikipedia.org/'
|
PAGE_BASE_URL = 'https://www.wikipedia.org/'
|
||||||
PAGE_LANGUAGE = 'English'
|
|
||||||
# PAGE_LANGUAGE = 'Español'
|
|
||||||
# PAGE_LANGUAGE = 'Русский'
|
|
||||||
|
|
||||||
# Data Layer Parameters
|
# Supported Languages so far:
|
||||||
SQLITE_DBFILE = '/home/mathew/.wikicrawler.db'
|
# German, English, Spanish, French, Italian, Portuguese, Polish, Russian
|
||||||
|
# 'de', 'en', 'es', 'fr', 'it', 'pl', 'pt', 'ru'
|
||||||
|
PAGE_LANGUAGE = 'en'
|
||||||
|
|
||||||
|
# API Keys
|
||||||
|
YANDEX_API_KEY = 'trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3'
|
||||||
|
|
||||||
|
# Logging Parameters
|
||||||
|
LOG_SETTINGS = {
|
||||||
|
'version': 1, # version is always 1
|
||||||
|
'formatters': {
|
||||||
|
'colored': {
|
||||||
|
'()': 'colorlog.ColoredFormatter',
|
||||||
|
'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(blue)s%(message)s'
|
||||||
|
},
|
||||||
|
'basic': {
|
||||||
|
'()': 'logging.Formatter',
|
||||||
|
'format': '%(levelname)s:%(name)s:%(asctime)s:%(message)s'
|
||||||
|
}
|
||||||
|
},
|
||||||
|
'handlers': {
|
||||||
|
'stderr': {
|
||||||
|
'class': 'logging.StreamHandler',
|
||||||
|
# The handler level will override the logger level if higher.
|
||||||
|
# That is, if the logger level is set to pass through DEBUG
|
||||||
|
# and higher and the handler is set to only pass through WARNING
|
||||||
|
# and higher, DEBUG messages will not pass through to this logger's
|
||||||
|
# handler. You can configure multiple handlers for any logger so
|
||||||
|
# for example you could log WARNINGS and ERRORS to a file but
|
||||||
|
# not save all the DEBUG messages.
|
||||||
|
'level': logging.DEBUG,
|
||||||
|
'formatter': 'colored'
|
||||||
|
},
|
||||||
|
'file': {
|
||||||
|
'class': 'logging.handlers.RotatingFileHandler',
|
||||||
|
'level': logging.INFO,
|
||||||
|
'formatter': 'basic',
|
||||||
|
'filename': LOG_FILENAME,
|
||||||
|
'maxBytes': 32768,
|
||||||
|
'backupCount': 3
|
||||||
|
}
|
||||||
|
},
|
||||||
|
'loggers': {
|
||||||
|
# Root Logger
|
||||||
|
'': {
|
||||||
|
'level': logging.DEBUG,
|
||||||
|
'handlers': ['file'],
|
||||||
|
},
|
||||||
|
'main': {
|
||||||
|
'level': logging.DEBUG,
|
||||||
|
'handlers': ['stderr'],
|
||||||
|
'propagate': False
|
||||||
|
},
|
||||||
|
'model': {
|
||||||
|
'level': logging.DEBUG,
|
||||||
|
'handlers': ['stderr'],
|
||||||
|
'propagate': True
|
||||||
|
},
|
||||||
|
'cli': {
|
||||||
|
'level': logging.DEBUG,
|
||||||
|
'handlers': ['stderr'],
|
||||||
|
'propagate': False
|
||||||
|
},
|
||||||
|
'pages': {
|
||||||
|
'level': logging.INFO,
|
||||||
|
'handlers': ['stderr'],
|
||||||
|
'propagate': False
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
23
setup.py
23
setup.py
@ -0,0 +1,23 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# setup.py is the install script for this application. This will download
|
||||||
|
# required third-party dependencies and package the app. You can also
|
||||||
|
# install the application system-wide.
|
||||||
|
|
||||||
|
from setuptools import setup
|
||||||
|
|
||||||
|
__project__ = 'wikicrawl'
# If you're looking for a versioning scheme, one revered pattern
# can be read about at http://semver.org
__version__ = '0.9.0'

setup(name = __project__,
      version = __version__,
      description = '',
      author = '',
      author_email = '',
      url = '',
      # Every third-party distribution that the wikicrawl package imports
      # must be listed here, otherwise a fresh install fails at import time.
      install_requires = ('yandex.translate',
                          'selenium',
                          'baker',     # imported by wikicrawl/cli.py
                          'colorlog',  # imported by wikicrawl/log.py
                          ),
      # Sub-packages are not discovered automatically when packages are
      # listed by hand, so the assets package is named explicitly.
      packages = ('wikicrawl',
                  'wikicrawl.assets',  # languages.py, imported by pages.py
                  ))
|
||||||
|
|
BIN
wikicrawl/.dal.py.swp
Normal file
BIN
wikicrawl/.dal.py.swp
Normal file
Binary file not shown.
12
wikicrawl/__init__.py
Normal file
12
wikicrawl/__init__.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# The __init__.py file signals to the python interpreter that the
|
||||||
|
# wikicrawl directory is a package. A package is a special module that
|
||||||
|
# contains other modules. Each file is a module (browser, cli, etc.)
|
||||||
|
# and the "wikicrawl" package is a module that contains other modules.
|
||||||
|
|
||||||
|
# The wikicrawl package, which is a module, exports the stuff exposed here.
|
||||||
|
# We export wikicrawl.init() as a reference to wikicrawl.config.init() and
|
||||||
|
# wikicrawl.main as a reference to wikicrawl.main.main
|
||||||
|
|
||||||
|
from .config import init
|
||||||
|
from .main import main
|
||||||
|
|
39
wikicrawl/assets/languages.py
Normal file
39
wikicrawl/assets/languages.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
|
||||||
|
LANGUAGES = {
|
||||||
|
'az': '',
|
||||||
|
'be': '',
|
||||||
|
'bg': '',
|
||||||
|
'ca': '',
|
||||||
|
'cs': '',
|
||||||
|
'da': '',
|
||||||
|
'de': 'Deutsch',
|
||||||
|
'el': '',
|
||||||
|
'en': 'English',
|
||||||
|
'es': 'Español',
|
||||||
|
'et': '',
|
||||||
|
'fi': '',
|
||||||
|
'fr': 'Français',
|
||||||
|
'hr': '',
|
||||||
|
'hu': '',
|
||||||
|
'hy': '',
|
||||||
|
'it': 'Italiano',
|
||||||
|
# 'ja': '日本語', -- no japanese in yandex
|
||||||
|
'lt': '',
|
||||||
|
'lv': '',
|
||||||
|
'mk': '',
|
||||||
|
'nl': '',
|
||||||
|
'no': '',
|
||||||
|
'pl': 'Polski',
|
||||||
|
'pt': 'Português',
|
||||||
|
'ro': '',
|
||||||
|
'ru': 'Русский',
|
||||||
|
'sk': '',
|
||||||
|
'sl': '',
|
||||||
|
'sq': '',
|
||||||
|
'sr': '',
|
||||||
|
'sv': '',
|
||||||
|
'tr': '',
|
||||||
|
'uk': '',
|
||||||
|
# 'zh': '中文' -- no chinese
|
||||||
|
}
|
||||||
|
|
@ -11,9 +11,9 @@
|
|||||||
|
|
||||||
import selenium
|
import selenium
|
||||||
import selenium.webdriver
|
import selenium.webdriver
|
||||||
import logging
|
|
||||||
|
|
||||||
from . import config
|
from . import config
|
||||||
|
from . import log
|
||||||
|
|
||||||
# This function has a parameter (driver) that passes in a value. In this case,
|
# This function has a parameter (driver) that passes in a value. In this case,
|
||||||
# this driver variable defaults to the string 'chrome'. The code can call
|
# this driver variable defaults to the string 'chrome'. The code can call
|
||||||
@ -25,9 +25,17 @@ def create_webdriver(driver='chrome'):
|
|||||||
return create_webdriver_chrome()
|
return create_webdriver_chrome()
|
||||||
elif driver == 'firefox':
|
elif driver == 'firefox':
|
||||||
return create_webdriver_firefox()
|
return create_webdriver_firefox()
|
||||||
|
elif driver == 'phantom':
|
||||||
|
return create_webdriver_phantom()
|
||||||
|
else:
|
||||||
|
log.LOGGER('browser').error('unable to handle webdriver request: %s' % driver)
|
||||||
|
return
|
||||||
|
|
||||||
def create_webdriver_firefox():
|
def create_webdriver_firefox():
|
||||||
pass
|
profile = selenium.webdriver.FirefoxProfile()
|
||||||
|
profile.set_preference("general.useragent.override", config.obj.WEBDRIVER_USER_AGENT)
|
||||||
|
driver = selenium.webdriver.Firefox(profile)
|
||||||
|
return driver
|
||||||
|
|
||||||
def create_webdriver_chrome():
|
def create_webdriver_chrome():
|
||||||
opt = selenium.webdriver.chrome.options.Options()
|
opt = selenium.webdriver.chrome.options.Options()
|
||||||
@ -35,3 +43,7 @@ def create_webdriver_chrome():
|
|||||||
driver = selenium.webdriver.Chrome(chrome_options = opt)
|
driver = selenium.webdriver.Chrome(chrome_options = opt)
|
||||||
return driver
|
return driver
|
||||||
|
|
||||||
|
def create_webdriver_phantom():
    """
    Creates a PhantomJS webdriver.

    The user agent configured in WEBDRIVER_USER_AGENT is applied to the
    driver, which is the same setting create_webdriver_firefox() applies
    to its browser profile.

    Returns:
        The selenium PhantomJS webdriver object.
    """
    # Copy the stock capabilities so the shared selenium default
    # dictionary is not modified.
    capabilities = dict(selenium.webdriver.DesiredCapabilities.PHANTOMJS)
    # PhantomJS reads its user agent from this page-settings capability.
    capabilities['phantomjs.page.settings.userAgent'] = config.obj.WEBDRIVER_USER_AGENT
    driver = selenium.webdriver.PhantomJS(desired_capabilities=capabilities)
    return driver
|
||||||
|
|
@ -1,37 +1,42 @@
|
|||||||
#!/usr/bin/env python
|
# The command-line interface module creates an interface for
|
||||||
# The command-line interface module creates a interface for
|
|
||||||
# interacting with the python program (wikicrawl). This is an implementation
|
# interacting with the python program (wikicrawl). This is an implementation
|
||||||
# of the baker demo shown previously. The user can type in commands to
|
# of the baker demo shown previously. The user can type in commands to
|
||||||
# make the program do things.
|
# make the program do things.
|
||||||
|
|
||||||
import baker
|
import baker
|
||||||
import logging
|
|
||||||
import readline # Needed for command history <up> and <down> arrows to work
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
if sys.platform == 'linux':
|
||||||
|
import readline # Needed for command history <up> and <down> arrows to work
|
||||||
|
|
||||||
|
from . import log
|
||||||
from . import model
|
from . import model
|
||||||
from . import config
|
from . import config
|
||||||
|
|
||||||
# Problem pages:
|
# Problem pages:
|
||||||
# Decision (from politics)
|
# Decision (from politics)
|
||||||
# Malaysia (goes inside parenthesis)
|
# Malaysia (goes inside parenthesis)
|
||||||
|
# Soft-sediment_deformation_structures (doesn't find link)
|
||||||
|
# Chemicals (loops at philosophical)
|
||||||
|
|
||||||
commander = baker.Baker()
|
commander = baker.Baker()
|
||||||
|
|
||||||
def main():
|
|
||||||
user_interface = InteractiveInterface()
|
|
||||||
|
|
||||||
if len(sys.argv) > 1: # Command line arguments were passed in
|
|
||||||
# command-line when invoking python
|
|
||||||
user_interface.run(sys.argv)
|
|
||||||
else:
|
|
||||||
user_interface.start_command_loop()
|
|
||||||
|
|
||||||
class InteractiveInterface:
|
class InteractiveInterface:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
# Instantiate the variable self.model as an object
|
||||||
|
# that is an instance of the Model class defined in the model
|
||||||
|
# module. model.Model refers to the Model class in the
|
||||||
|
# model module and this line creates a new variable (self.model)
|
||||||
|
# which is a variable that is an instance of Model, i.e.
|
||||||
|
# it has the type Model and has Model.methods() available
|
||||||
|
# to it.
|
||||||
|
#
|
||||||
|
# self.model is a variable that is attached to the instance/object
|
||||||
|
# returned by this constructor that has the type InteractiveInterface.
|
||||||
self.model = model.Model()
|
self.model = model.Model()
|
||||||
|
|
||||||
def run(self, args, main=True):
|
def run_command(self, args, main=True):
|
||||||
"""
|
"""
|
||||||
Runs the command-line interface for a single command.
|
Runs the command-line interface for a single command.
|
||||||
|
|
||||||
@ -45,13 +50,13 @@ class InteractiveInterface:
|
|||||||
commander.run(argv=args, main=True, help_on_error=True,
|
commander.run(argv=args, main=True, help_on_error=True,
|
||||||
instance=self)
|
instance=self)
|
||||||
except baker.CommandError as ex:
|
except baker.CommandError as ex:
|
||||||
logging.warn('incorrect user input: %s' % ex)
|
log.LOGGER['cli'].warn('incorrect user input: %s' % ex)
|
||||||
commander.usage()
|
commander.usage()
|
||||||
except baker.TopHelp as ex:
|
except baker.TopHelp as ex:
|
||||||
commander.usage()
|
commander.usage()
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logging.error('caught general exception!!')
|
log.LOGGER['cli'].error('caught general exception!!')
|
||||||
print(type(ex), ex)
|
log.LOGGER['cli'].error(type(ex), ex)
|
||||||
|
|
||||||
def start_command_loop(self):
|
def start_command_loop(self):
|
||||||
"""
|
"""
|
||||||
@ -76,12 +81,13 @@ class InteractiveInterface:
|
|||||||
# to NOT drop to a newline after printing
|
# to NOT drop to a newline after printing
|
||||||
# in the terminal. Instead, let the user
|
# in the terminal. Instead, let the user
|
||||||
# type their command on the same line as
|
# type their command on the same line as
|
||||||
# our printed '$ '.
|
# the printed '$ '.
|
||||||
try:
|
try:
|
||||||
inp = input()
|
inp = input()
|
||||||
except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
|
except EOFError: # <ctrl>+D will send "End Line" and exit the command loop
|
||||||
break
|
break
|
||||||
# Note in arguments (mg):
|
|
||||||
|
# Note on "arguments" (mg):
|
||||||
# Whenever a program is run in windows or *nix, the operating
|
# Whenever a program is run in windows or *nix, the operating
|
||||||
# system passes in the command string that was used to invoke
|
# system passes in the command string that was used to invoke
|
||||||
# the program. You can append data in that command to configure
|
# the program. You can append data in that command to configure
|
||||||
@ -91,16 +97,16 @@ class InteractiveInterface:
|
|||||||
# software but you can also pass in an argument. You can
|
# software but you can also pass in an argument. You can
|
||||||
# alternatively run "python launcher.py <argument> <argument>..."
|
# alternatively run "python launcher.py <argument> <argument>..."
|
||||||
# and the operating system will provide the <argument> values into
|
# and the operating system will provide the <argument> values into
|
||||||
# the process that is running.
|
# the process that is running as variables.
|
||||||
#
|
#
|
||||||
# In a real world use case, many commands provide switches to
|
# In a real world use case, many commands provide switches to
|
||||||
# adjust what the program does. For example,
|
# adjust what the program does. For example,
|
||||||
#
|
#
|
||||||
# The command:
|
# The command:
|
||||||
# find music -iname "*justin*bieber*"
|
# find music -name "*justin*bieber*"
|
||||||
# runs the "find" program and asks to find all the filenames that match the
|
# runs the "find" program and asks to find all the filenames that match the
|
||||||
# pattern *justin*bieber* in the "music" directory.
|
# pattern *justin*bieber* in the "music" directory.
|
||||||
# (music, -iname, "*justin*biever*") are argument parameters
|
# (music, -name, "*justin*bieber*") are argument parameters
|
||||||
# that are passed into the program. The program is coded to
|
# that are passed into the program. The program is coded to
|
||||||
# parse and interpret these values and execute differently based
|
# parse and interpret these values and execute differently based
|
||||||
# on the values passed in. This is one way to pass in information
|
# on the values passed in. This is one way to pass in information
|
||||||
@ -123,14 +129,21 @@ class InteractiveInterface:
|
|||||||
# would be C:\Users\mguest\launcher.py.
|
# would be C:\Users\mguest\launcher.py.
|
||||||
|
|
||||||
# What this method (start_command_loop()) does is provide a
|
# What this method (start_command_loop()) does is provide a
|
||||||
# REPL which is a
|
# REPL shell which is a
|
||||||
# read-eval-print-loop. It repeatedly asks the user for an
|
# read-eval-print-loop. It repeatedly asks the user for an
|
||||||
# input (read), evaluates that input into an action (evaluate),
|
# input (read), evaluates that input into an action (evaluate),
|
||||||
# give the user some feedback (print), and start the process
|
# gives the user some feedback (print), and starts the process
|
||||||
# over again (loop). When you call "python", you are given a python
|
# over again (loop). When you call just "python", you are loading a
|
||||||
# process that gives you a REPL interactive shell. The way
|
# program that gives you a REPL interactive shell. The way
|
||||||
# this wikicrawl app is implemented gives the user a REPL
|
# this wikicrawl app is implemented gives the user a REPL
|
||||||
# that has commands to interact with wikipedia pages.
|
# that has commands to interact with wikipedia pages.
|
||||||
|
|
||||||
|
# Because we take in the input as a single string, we do
|
||||||
|
# a transformation to turn something like "do_random_page 5"
|
||||||
|
# into ["launcher.py", "do_random_page", "5"] which is how
|
||||||
|
# the arguments array would have been created if it were
|
||||||
|
# passed in the initial command instead of typed and interpreted
|
||||||
|
# as input as is done here.
|
||||||
args = [sys.argv[0], ] + inp.split()
|
args = [sys.argv[0], ] + inp.split()
|
||||||
|
|
||||||
# The user can at any point in the command pass the argument
|
# The user can at any point in the command pass the argument
|
||||||
@ -146,40 +159,42 @@ class InteractiveInterface:
|
|||||||
# python launcher.py do_random_page --help
|
# python launcher.py do_random_page --help
|
||||||
# You will see the program spit out the heredoc below the
|
# You will see the program spit out the heredoc below the
|
||||||
# do_random_page method defined below.
|
# do_random_page method defined below.
|
||||||
|
|
||||||
if '--help' in args:
|
if '--help' in args:
|
||||||
args.remove('--help')
|
args.remove('--help')
|
||||||
try:
|
try:
|
||||||
print('command usage:')
|
print('command usage:')
|
||||||
commander.usage(args[1])
|
commander.usage(args[1])
|
||||||
return
|
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
print(type(ex), ex)
|
print(type(ex), ex)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self.run(args, main=False)
|
self.run_command(args, main=False)
|
||||||
|
|
||||||
@commander.command
|
@commander.command
|
||||||
def do_random_page(self):
|
def play_random_page(self):
|
||||||
"""
|
"""
|
||||||
Instructs the wikicrawl application to play the game on a random
|
Instructs the wikicrawl application to play the game on a random
|
||||||
article.
|
article.
|
||||||
"""
|
"""
|
||||||
self.model.do_random_page()
|
self.model.play_random_page()
|
||||||
|
|
||||||
@commander.command
|
@commander.command
|
||||||
def do_n_pages(self, n):
|
def play_multiple(self, n):
|
||||||
"""
|
"""
|
||||||
Plays the wikicrawl game <n>-times.
|
Plays the wikicrawl game <n>-times.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
n = int(n)
|
n = int(n)
|
||||||
except ValueError as ex:
|
except ValueError as ex:
|
||||||
logging.warn('failed to process "%s" as a parameter' % n)
|
log.LOGGER['cli'].warn('failed to process "%s" as a parameter' % n)
|
||||||
return False
|
return False
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
self.model.do_random_page()
|
self.model.play_random_page()
|
||||||
|
|
||||||
if __name__ == '__main__':
|
@commander.command
|
||||||
main()
|
def exit(self):
|
||||||
|
"""
|
||||||
|
Immediately exit the program.
|
||||||
|
"""
|
||||||
|
sys.exit(0)
|
||||||
|
|
33
wikicrawl/log.py
Normal file
33
wikicrawl/log.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# log module is a wrapper around third-party colorlog library
|
||||||
|
# and provides an application-level interface to a logging system.
|
||||||
|
|
||||||
|
import colorlog
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from . import config
|
||||||
|
|
||||||
|
# Default python log severity levels:
|
||||||
|
# CRITICAL
|
||||||
|
# ERROR
|
||||||
|
# WARNING
|
||||||
|
# INFO
|
||||||
|
# DEBUG
|
||||||
|
|
||||||
|
LOGGER = None
|
||||||
|
|
||||||
|
class LoggingLayer:
    """
    Application-level registry of named loggers.

    Creating the object applies a logging configuration dictionary.
    Afterwards loggers are fetched by name with subscript syntax, for
    example LOGGER['cli'], and each logger that has been fetched is
    remembered in self.loggers.
    """
    def __init__(self, config):
        """
        Applies the logging configuration and starts with an empty
        logger cache.

        Args:
            config: logging configuration dictionary in the schema
                    accepted by logging.config.dictConfig().
        """
        # "import logging" alone does not load the logging.config
        # submodule, so it is imported explicitly here rather than
        # relying on some other module having imported it first.
        import logging.config

        self.loggers = {}
        logging.config.dictConfig(config)

    def __getitem__(self, k):
        """
        Returns the logger registered under the name k.

        Args:
            k: name of the logger, e.g. 'cli' or 'model'.

        Returns:
            The logging.Logger object for that name. The same object is
            returned on every lookup of the same name.
        """
        logger = self.loggers.get(k)
        if logger is None:
            # First request for this name: fetch it from the logging
            # module and remember it for later lookups.
            logger = logging.getLogger(k)
            self.loggers[k] = logger
        return logger
|
||||||
|
|
||||||
|
def init_logging():
    """
    Builds a LoggingLayer from config.obj.LOG_SETTINGS and publishes it
    as the module-level LOGGER variable.
    """
    global LOGGER
    layer = LoggingLayer(config.obj.LOG_SETTINGS)
    LOGGER = layer
|
||||||
|
|
19
wikicrawl/main.py
Normal file
19
wikicrawl/main.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from . import cli
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
def main():
    """
    Entry point of the application.

    When arguments were given on the command line they are run as a
    single command. Otherwise the interactive command loop is started.
    """
    interface = cli.InteractiveInterface()

    # sys.argv always holds the script name, so more than one entry
    # means arguments were passed in when invoking python.
    if len(sys.argv) <= 1:
        interface.start_command_loop()
        return
    interface.run_command(sys.argv)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
@ -6,7 +6,6 @@
|
|||||||
# to implement the wiki crawl. This is a separation of concerns
|
# to implement the wiki crawl. This is a separation of concerns
|
||||||
# and keeps the logic organized and separated.
|
# and keeps the logic organized and separated.
|
||||||
|
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -15,10 +14,12 @@ from . import config
|
|||||||
from . import dal
|
from . import dal
|
||||||
from . import log
|
from . import log
|
||||||
from . import pages
|
from . import pages
|
||||||
|
from . import util
|
||||||
|
|
||||||
class Model:
|
class Model:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._webdriver = None
|
self._webdriver = None
|
||||||
|
self._translated_philosophy = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def webdriver(self):
|
def webdriver(self):
|
||||||
@ -37,6 +38,16 @@ class Model:
|
|||||||
page_api.goto_landing_page()
|
page_api.goto_landing_page()
|
||||||
return self._webdriver
|
return self._webdriver
|
||||||
|
|
||||||
|
@property
def translated_philosophy(self):
    """
    The word 'philosophy' in the configured page language.

    For English no translation is requested. For any other language
    the translation is requested once and the stored result is re-used
    for as long as it is non-empty.
    """
    language = config.obj.PAGE_LANGUAGE
    if language == 'en':
        self._translated_philosophy = 'philosophy'
        return self._translated_philosophy
    if self._translated_philosophy:
        # Already translated earlier, so no further api call is made.
        return self._translated_philosophy
    self._translated_philosophy = util.translate_text('en', language, 'philosophy')
    return self._translated_philosophy
|
||||||
|
|
||||||
def open_browser(self):
|
def open_browser(self):
|
||||||
x = self.webdriver # Request the browser open immediately.
|
x = self.webdriver # Request the browser open immediately.
|
||||||
# Without this, the Model object will
|
# Without this, the Model object will
|
||||||
@ -48,7 +59,7 @@ class Model:
|
|||||||
# creates it and then it is re-used later
|
# creates it and then it is re-used later
|
||||||
# in the application.
|
# in the application.
|
||||||
|
|
||||||
def do_random_page(self):
|
def play_random_page(self):
|
||||||
"""
|
"""
|
||||||
Select a random page and repeatedly click the first link until
|
Select a random page and repeatedly click the first link until
|
||||||
we reach the article on philosophy. Sometimes, the driver encounters
|
we reach the article on philosophy. Sometimes, the driver encounters
|
||||||
@ -93,25 +104,50 @@ class Model:
|
|||||||
# the action we're trying to invoke.
|
# the action we're trying to invoke.
|
||||||
page_api.goto_random_article()
|
page_api.goto_random_article()
|
||||||
|
|
||||||
# Article page
|
# Article pages
|
||||||
pages_visited = []
|
pages_visited = []
|
||||||
|
|
||||||
|
# We just need translated_title to exist
|
||||||
|
translated_title = None
|
||||||
while True:
|
while True:
|
||||||
page_api = pages.ArticlePage(self.webdriver)
|
page_api = pages.ArticlePage(self.webdriver)
|
||||||
|
|
||||||
|
# Get the article title (and translate if necessary)
|
||||||
title = page_api.get_title()
|
title = page_api.get_title()
|
||||||
logging.debug('visited page: %s' % title)
|
if config.obj.PAGE_LANGUAGE != 'en':
|
||||||
|
translated_title = util.translate_text(config.obj.PAGE_LANGUAGE, 'en', title)
|
||||||
|
log.LOGGER['model'].info('visited page: %s (%s)' % (title, translated_title))
|
||||||
|
else:
|
||||||
|
log.LOGGER['model'].info('visited page: %s' % title)
|
||||||
|
|
||||||
|
# Check for page loops (have we already visited this page?)
|
||||||
if title in pages_visited:
|
if title in pages_visited:
|
||||||
logging.info('encountered loop at page = %s' % title)
|
log.LOGGER['model'].info('encountered loop at page = %s' % title)
|
||||||
break
|
break
|
||||||
if title == 'Philosophy':
|
|
||||||
logging.info('made it to philosophy in %s pages' % len(pages_visited))
|
# Check if we reached the article on philosophy
|
||||||
|
if self._is_article_on_philosophy(title, translated_title):
|
||||||
|
log.LOGGER['model'].info('made it to philosophy in %s pages' % len(pages_visited))
|
||||||
pages_visited.append(title)
|
pages_visited.append(title)
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Store the result of what articles have been navigated
|
||||||
pages_visited.append(title)
|
pages_visited.append(title)
|
||||||
|
|
||||||
rc = page_api.click_first_link()
|
rc = page_api.click_first_link()
|
||||||
if not rc:
|
if not rc:
|
||||||
logging.warn('failure: unable to continue (perhaps no valid links?)')
|
log.LOGGER['model'].warn('failure: unable to continue (perhaps no valid links?)')
|
||||||
break
|
break
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
def _is_article_on_philosophy(self, title, translated_title):
    """
    Checks both the original title and the translated (to english) title to
    see if they seem to be the page on philosophy.

    Args:
        title: article title as shown on the page.
        translated_title: the title translated to english, or None when
                          no translation was made.

    Returns:
        True if either title matches, False otherwise.
    """
    def matches(text, expected):
        # A missing or empty value never counts as a match. casefold()
        # is used instead of lower() because titles may be in any
        # language and casefold() is the caseless comparison that
        # handles characters such as the German sharp s.
        if not text or not expected:
            return False
        return text.casefold() == expected.casefold()

    if matches(title, self.translated_philosophy):
        return True
    return matches(translated_title, 'philosophy')
|
||||||
|
|
@ -1,22 +1,16 @@
|
|||||||
# Pages module defines classes for interacting with wikipedia pages.
|
# pages module defines classes for interacting with wikipedia pages.
|
||||||
# There are separate classes defined for each page with their own
|
# There are separate classes defined for each page with their own
|
||||||
# defined methods for performing certain actions.
|
# defined methods for performing certain actions.
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
import re
|
||||||
import selenium
|
import selenium
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from . import browser
|
from . import browser
|
||||||
from . import config
|
from . import config
|
||||||
|
from . import log
|
||||||
def breakpoint():
|
from . import util
|
||||||
"""
|
from .assets.languages import LANGUAGES
|
||||||
If DO_BREAKPOINTS is switched on, this will pause program
|
|
||||||
execution and wait for the user to press enter to continue.
|
|
||||||
"""
|
|
||||||
if config.obj.DO_BREAKPOINTS:
|
|
||||||
input('Breakpoint here. <Enter> to continue...')
|
|
||||||
|
|
||||||
class PageRootObject:
|
class PageRootObject:
|
||||||
"""
|
"""
|
||||||
@ -28,7 +22,7 @@ class PageRootObject:
|
|||||||
In here are some re-used methods to click links and highlight
|
In here are some re-used methods to click links and highlight
|
||||||
elements in the browser.
|
elements in the browser.
|
||||||
"""
|
"""
|
||||||
def __init__(self, driver=None):
|
def __init__(self, driver):
|
||||||
"""
|
"""
|
||||||
Object constructor for initializing the instance of this
|
Object constructor for initializing the instance of this
|
||||||
class with internal variables needed.
|
class with internal variables needed.
|
||||||
@ -37,9 +31,6 @@ class PageRootObject:
|
|||||||
driver: Reference to the selenium webdriver object
|
driver: Reference to the selenium webdriver object
|
||||||
that is used to interface with the web browser.
|
that is used to interface with the web browser.
|
||||||
"""
|
"""
|
||||||
if not driver:
|
|
||||||
self.driver = browser.create_webdriver()
|
|
||||||
else:
|
|
||||||
self.driver = driver
|
self.driver = driver
|
||||||
|
|
||||||
def click(self, el):
|
def click(self, el):
|
||||||
@ -49,11 +40,11 @@ class PageRootObject:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
el: selenium element to be clicked. Typically an anchor
|
el: selenium element to be clicked. Typically an anchor
|
||||||
html link in the page.
|
html link in the webpage.
|
||||||
"""
|
"""
|
||||||
self.highlight(el, 'red')
|
self.highlight(el, 'red')
|
||||||
time.sleep(config.obj.PAGE_DELAY)
|
time.sleep(config.obj.PAGE_DELAY)
|
||||||
breakpoint()
|
util.breakpoint()
|
||||||
el.click()
|
el.click()
|
||||||
|
|
||||||
def highlight(self, el, color):
|
def highlight(self, el, color):
|
||||||
@ -68,36 +59,55 @@ class PageRootObject:
|
|||||||
color: background color to highlight. Input can be one of
|
color: background color to highlight. Input can be one of
|
||||||
'red', 'blue', or hex code such as '#ffffff'.
|
'red', 'blue', or hex code such as '#ffffff'.
|
||||||
"""
|
"""
|
||||||
# Note: The way hex codes work is there are 1 byte (2 hex characters)
|
# Note: The way hex codes work is there is 1 byte (2 hex characters)
|
||||||
# for every color. #RRGGBB for (red, green, blue). This can be thought
|
# for every color. #RRGGBB for (red, green, blue). This can be thought
|
||||||
# of as an integer 0-255 for red, green, and blue in base-16 hexadecimal.
|
# of as an integer 0-255 for red, green, and blue in base-16 hexadecimal.
|
||||||
|
# For example, #ff0000 is bright red while #002f00 is dark green
|
||||||
|
# and #ffff00 is full yellow.
|
||||||
if color == 'red':
|
if color == 'red':
|
||||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
|
color = '#ff9292'
|
||||||
|
# js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#ff9292'
|
||||||
elif color == 'blue':
|
elif color == 'blue':
|
||||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
|
color = '#9292ff'
|
||||||
|
# js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % '#9292ff'
|
||||||
else:
|
else:
|
||||||
|
# color = color
|
||||||
|
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
|
||||||
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
|
js = 'arguments[0].setAttribute("style", "background: %s;font-weight:bold;color:#000")' % color
|
||||||
self.driver.execute_script(js, el)
|
self.driver.execute_script(js, el)
|
||||||
|
|
||||||
|
# Note: This is the syntax for class inheritance. LandingPage is a new type of object that inherits
|
||||||
|
# everything from the PageRootObject type. With this, you can call LandingPage.highlight() which
|
||||||
|
# is a method defined in PageRootObject.
|
||||||
class LandingPage(PageRootObject):
|
class LandingPage(PageRootObject):
|
||||||
"""
|
"""
|
||||||
Interface for working with the wikipedia.org landing page. This page has links to
|
Interface for working with the wikipedia.org landing page. This page has links to
|
||||||
select a language and go to the respective wikipedia root page.
|
select a language and go to the respective wikipedia root page.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Note: This is the LandingPage() object constructor. All it does right now is
|
# Note: This is the LandingPage() class constructor. The constructor is a method
|
||||||
|
# that is executed when a new object of this class is created. All it does right now is
|
||||||
# reference the parent (PageRootObject) constructor method and call it. This
|
# reference the parent (PageRootObject) constructor method and call it. This
|
||||||
# calls PageRootObject.__init__(driver) which makes the web driver available
|
# calls PageRootObject.__init__(driver) which then makes the web driver available
|
||||||
# in the object instance.
|
# in the object instance.
|
||||||
def __init__(self, driver=None):
|
def __init__(self, driver=None):
|
||||||
super().__init__(driver)
|
super().__init__(driver)
|
||||||
|
|
||||||
def goto_landing_page(self):
|
def goto_landing_page(self):
|
||||||
|
"""
|
||||||
|
Navigates the browser to www.wikipedia.org
|
||||||
|
"""
|
||||||
self.driver.get(config.obj.PAGE_BASE_URL)
|
self.driver.get(config.obj.PAGE_BASE_URL)
|
||||||
|
|
||||||
def select_language(self, language):
|
def select_language(self, language):
|
||||||
link = self.driver.find_element_by_partial_link_text(language)
|
lang_text = LANGUAGES.get(language)
|
||||||
|
try:
|
||||||
|
link = self.driver.find_element_by_partial_link_text(lang_text)
|
||||||
self.click(link)
|
self.click(link)
|
||||||
|
return True
|
||||||
|
except selenium.common.exceptions.NoSuchElementException as ex:
|
||||||
|
logging.warn('failed to find language: %s as %s' % (language, lang_text))
|
||||||
|
return False
|
||||||
|
|
||||||
class MainPage(PageRootObject):
|
class MainPage(PageRootObject):
|
||||||
"""
|
"""
|
||||||
@ -121,7 +131,7 @@ class ArticlePage(PageRootObject):
|
|||||||
# These are used to locate html elements in the web browser. There are many
|
# These are used to locate html elements in the web browser. There are many
|
||||||
# ways to locate elements but one of the best if available is locating by id. It's
|
# ways to locate elements but one of the best if available is locating by id. It's
|
||||||
# not enforced but the html specification mandates that element id's are unique
|
# not enforced but the html specification mandates that element id's are unique
|
||||||
# so if you can select by id in a semanticly correct web page, you can correctly
|
# so if you can select by id in a semantically correct web page, you can correctly
|
||||||
# select unique elements with high confidence.
|
# select unique elements with high confidence.
|
||||||
elements = {
|
elements = {
|
||||||
'main-window-content-text-id': 'mw-content-text',
|
'main-window-content-text-id': 'mw-content-text',
|
||||||
@ -132,10 +142,18 @@ class ArticlePage(PageRootObject):
|
|||||||
super().__init__(driver)
|
super().__init__(driver)
|
||||||
|
|
||||||
def get_title(self):
|
def get_title(self):
|
||||||
|
"""
|
||||||
|
Returns the article title.
|
||||||
|
"""
|
||||||
heading = self.driver.find_element_by_id(ArticlePage.elements['article-title'])
|
heading = self.driver.find_element_by_id(ArticlePage.elements['article-title'])
|
||||||
return heading.text
|
return heading.text
|
||||||
|
|
||||||
def click_first_link(self):
|
def click_first_link(self):
|
||||||
|
"""
|
||||||
|
Attempts to click the first valid link in the article. Some work is
|
||||||
|
done to skip over certain links but the implementation breaks in some
|
||||||
|
edge cases. It's close but not perfect for every article text.
|
||||||
|
"""
|
||||||
return self._iterate_paragraphs()
|
return self._iterate_paragraphs()
|
||||||
|
|
||||||
# Note: Here this method has its name prepended with a single underscore.
|
# Note: Here this method has its name prepended with a single underscore.
|
||||||
@ -159,33 +177,51 @@ class ArticlePage(PageRootObject):
|
|||||||
main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
|
main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
|
||||||
paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
|
paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
|
||||||
for p in paragraphs:
|
for p in paragraphs:
|
||||||
|
# Return code indicates the success status of _parse_paragraph().
|
||||||
|
# In this case, an rc of True means that it was able to find a
|
||||||
|
# link and we stop going through paragraphs.
|
||||||
rc = self._parse_paragraph(p)
|
rc = self._parse_paragraph(p)
|
||||||
if rc:
|
if rc:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _parse_paragraph(self, p):
|
def _parse_paragraph(self, p):
|
||||||
|
"""
|
||||||
|
Attempts to find a valid link in the paragraph element sent in.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
p: Reference to selenium paragraph element. This is a paragraph
|
||||||
|
taken from the article.
|
||||||
|
"""
|
||||||
links = p.find_elements_by_xpath('.//a')
|
links = p.find_elements_by_xpath('.//a')
|
||||||
if len(links) == 0:
|
if len(links) == 0:
|
||||||
return False
|
return False
|
||||||
for link in links:
|
for link in links:
|
||||||
logging.debug('processing link: %s' % link.text)
|
log.LOGGER['pages'].debug('processing link: %s' % link.text)
|
||||||
if not self._is_valid_link(p, link):
|
if not self._is_valid_link(p, link):
|
||||||
logging.debug('skipping link inside parenthesis: %s' % link.text)
|
log.LOGGER['pages'].debug('skipping link inside parenthesis: %s' % link.text)
|
||||||
self.highlight(link, 'blue')
|
self.highlight(link, 'blue')
|
||||||
continue
|
continue
|
||||||
self.highlight(link, 'red')
|
self.highlight(link, 'red')
|
||||||
logging.info('selected link: %s' % link.text)
|
log.LOGGER['pages'].info('selected link: %s' % link.text)
|
||||||
self.click(link)
|
self.click(link)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _is_valid_link(self, p, el):
|
def _is_valid_link(self, p, el):
|
||||||
a = self._is_link_in_parenthesis(p, el)
|
"""
|
||||||
b = self._is_link_a_footnote(el)
|
Returns False if the implementation decides to skip this link. You can
|
||||||
c = self._is_link_pronounciation(el)
|
see the reasons we invalidate and skip a link here. If it's
|
||||||
d = self._is_link_audio(el)
|
inside parenthesis, is a footnote, is a pronunciation guide or
|
||||||
if not a and not b and not c and not d:
|
audio link, we choose to skip it.
|
||||||
return True
|
"""
|
||||||
|
if self._is_link_in_parenthesis(p, el):
|
||||||
return False
|
return False
|
||||||
|
if self._is_link_a_footnote(el):
|
||||||
|
return False
|
||||||
|
if self._is_link_pronounciation(el):
|
||||||
|
return False
|
||||||
|
if self._is_link_audio(el):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
def _is_link_in_parenthesis(self, p, el):
|
def _is_link_in_parenthesis(self, p, el):
|
||||||
"""
|
"""
|
||||||
@ -198,7 +234,7 @@ class ArticlePage(PageRootObject):
|
|||||||
# certain links and usually avoid links inside parenthetical
|
# certain links and usually avoid links inside parenthetical
|
||||||
# notes. Some edge cases are nested parenthesis, links with
|
# notes. Some edge cases are nested parenthesis, links with
|
||||||
# non-english characters (which are displayed with a tree
|
# non-english characters (which are displayed with a tree
|
||||||
# of elements in the html rather than a simply link). And
|
# of elements in the html rather than a simple link). And
|
||||||
# sometimes, the link inside the parenthesis may be a valid
|
# sometimes, the link inside the parenthesis may be a valid
|
||||||
# target. I've made it so that skipped links show up as blue
|
# target. I've made it so that skipped links show up as blue
|
||||||
# and determined-valid links highlight as red.
|
# and determined-valid links highlight as red.
|
23
wikicrawl/util.py
Normal file
23
wikicrawl/util.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# util module contains utility functions that can be common or shared
|
||||||
|
# between the other modules.
|
||||||
|
|
||||||
|
import yandex_translate
|
||||||
|
|
||||||
|
from . import config
|
||||||
|
|
||||||
|
def breakpoint():
    """
    Optionally pause the program until the user presses <Enter>.

    The pause only happens when config.obj.DO_BREAKPOINTS is truthy;
    otherwise this function returns immediately without prompting.
    """
    # Guard clause: breakpoints are disabled, so there is nothing to do.
    if not config.obj.DO_BREAKPOINTS:
        return
    # Block on stdin; whatever the user typed is intentionally discarded.
    input('BREAKPOINT hit. <Enter> to continue...')
|
||||||
|
|
||||||
|
def translate_text(source_language, target_language, text):
    """
    Translate text through the Yandex translate client.

    Args:
        source_language: Language code of the input text. When this is
                         falsy (None or ''), the client's detect() call
                         is used to determine it from the text.
        target_language: Language code to translate into.
        text: The text to translate.

    Returns:
        The first entry of the 'text' list in the client's response
        (presumably the translated string -- confirm against the
        yandex_translate documentation).
    """
    client = yandex_translate.YandexTranslate(config.obj.YANDEX_API_KEY)
    # Only ask the service to detect the language when the caller did not
    # supply one.
    origin = source_language if source_language else client.detect(text)
    # The client takes the direction as a single "<from>-<to>" string.
    direction = '%s-%s' % (origin, target_language)
    return client.translate(text, direction)['text'][0]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user