From 6ae2a8dd06d3cc8017f99bb136fd2a1a3ea5cd79 Mon Sep 17 00:00:00 2001 From: Mathew Guest Date: Fri, 24 Jan 2020 03:01:45 -0700 Subject: [PATCH] start of new app structure for wiki --- .gitignore | 5 + launcher.py | 11 - lib/app_skellington/README.md | 60 +++ lib/app_skellington/__init__.py | 11 + lib/app_skellington/_bootstrap.py | 31 ++ lib/app_skellington/_util.py | 116 ++++++ lib/app_skellington/app_container.py | 202 ++++++++++ lib/app_skellington/cfg.py | 184 +++++++++ lib/app_skellington/cli.py | 540 +++++++++++++++++++++++++++ lib/app_skellington/log.py | 165 ++++++++ lib/setup.py | 45 +++ road2philosophy.py | 6 + setup.py | 45 ++- wikicrawl/__init__.py | 3 +- wikicrawl/app.py | 87 +++++ wikicrawl/browser.py | 67 ++-- wikicrawl/cli.py | 203 ---------- wikicrawl/config.py | 22 -- wikicrawl/config.spec | 76 ++++ wikicrawl/dal.py | 6 +- wikicrawl/log.py | 39 -- wikicrawl/main.py | 23 -- wikicrawl/model.py | 45 ++- wikicrawl/pages.py | 21 +- wikicrawl/util.py | 4 +- 25 files changed, 1641 insertions(+), 376 deletions(-) create mode 100644 .gitignore delete mode 100755 launcher.py create mode 100644 lib/app_skellington/README.md create mode 100644 lib/app_skellington/__init__.py create mode 100644 lib/app_skellington/_bootstrap.py create mode 100644 lib/app_skellington/_util.py create mode 100644 lib/app_skellington/app_container.py create mode 100644 lib/app_skellington/cfg.py create mode 100644 lib/app_skellington/cli.py create mode 100644 lib/app_skellington/log.py create mode 100644 lib/setup.py create mode 100755 road2philosophy.py mode change 100644 => 100755 setup.py create mode 100644 wikicrawl/app.py delete mode 100644 wikicrawl/cli.py delete mode 100644 wikicrawl/config.py create mode 100644 wikicrawl/config.spec delete mode 100644 wikicrawl/log.py delete mode 100644 wikicrawl/main.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..41fd0b5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +build/ +dist/ +__pycache__ +*.egg-info + 
diff --git a/launcher.py b/launcher.py deleted file mode 100755 index 4d37cba..0000000 --- a/launcher.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env python - -import wikicrawl -import settings - -# Inject the settings.DefaultSettings object into the -# app and start running the program. -wikicrawl.init(settings.DefaultSettings) -wikicrawl.main() -input(' to exit') - diff --git a/lib/app_skellington/README.md b/lib/app_skellington/README.md new file mode 100644 index 0000000..4025b8f --- /dev/null +++ b/lib/app_skellington/README.md @@ -0,0 +1,60 @@ +app_skellington +=============== +Application framework for Python, features include: + * Pain-free multi-level command menu: Register classes + * Simple to define services and automatic dependency injection based on name (with custom invocation as an option) + * INI-style config and validation (provided through ConfigObj) + * Colored logging (provided through colorlog) + * Works on Linux, Windows, and Mac + +Principles: + * Lend to creating beautiful, easy to read and understand code in the application. + * Minimize coupling of applications to this framework. + * Compatible with Linux, Windows, and Mac. Try to be as compatible as possible otherwise. + * Try to be compatible with alternate Python runtimes such as PyPy. + +Application Configuration +------------------------- +Site configurations are supported through configobj. There is a config.spec +in the src directory which is a validation file; it contains the accepted +parameter names, types, and limits for configurable options in the +application which is built on app_skellington. The format is multi-level .ini syntax. + +See the configobj documentation for more information. + +Site configuration files (config.ini) are created if they don't exist. The +file always contains the full specification of parameters; i.e. even default +parameters are added into the config file. 
+ +Linux: + +/home/\/.config/\/config.ini + +/home/\/.cache/\/log/\.log + +Windows: + +C:\Users\\\\\\\\Local\\\\\config.ini + +C:\Users\\\\\\\\Local\\\\\Logs\\\.log + +Application configuration can be overridden ad-hoc through the --config +argument. + +Debug - Turn on Logging +--------------------------- +Set 'APPSKELLINGTON_ENABLE_LOGGING' environment variable to any value which turns +on AppSkellington-level logging. For example, + + APPSKELLINGTON_ENABLE_LOGGING=true ./runme + +or + + export APPSKELLINGTON_ENABLE_LOGGING=1 + ./runme + +Notes +----- +See official website: https://zavage-software.com +Please report bugs, improvements, or feedback! + diff --git a/lib/app_skellington/__init__.py b/lib/app_skellington/__init__.py new file mode 100644 index 0000000..3f67d9b --- /dev/null +++ b/lib/app_skellington/__init__.py @@ -0,0 +1,11 @@ +import logging +import sys + +APP_CONFIG_FILENAME = 'config.ini' # Relative to user directory on machine +APP_CONFIGSPEC_FILENAME = 'config.spec' # Relative to module source directory + +from .app_container import * +from .cfg import * +from .cli import * +from .log import * + diff --git a/lib/app_skellington/_bootstrap.py b/lib/app_skellington/_bootstrap.py new file mode 100644 index 0000000..155d112 --- /dev/null +++ b/lib/app_skellington/_bootstrap.py @@ -0,0 +1,31 @@ +import logging +import os +import sys + +# Check and gracefully fail if the user needs to install a 3rd-party dep. 
def eprint(*args, **kwargs):
    """
    Print to the STDERR stream.

    Takes the same positional and keyword arguments as the built-in
    print(); the ``file`` argument is always sys.stderr.
    """
    print(*args, file=sys.stderr, **kwargs)


def filename_to_abspath(filename):
    """
    Converts a filename to its absolute path via os.path.abspath.
    """
    return os.path.abspath(filename)


def does_file_exist(filepath):
    """
    Reports whether the file at filepath could be opened for reading at
    this instant.

    Because the file can be deleted or created immediately after execution
    of this function, no guarantee can be made about the existence of the
    file afterwards (race condition).

    Returns:
        True if the file was opened successfully, False if opening raised
        FileNotFoundError. Any other error raised by open() propagates.
    """
    try:
        # The file is opened only as a probe; the context manager closes
        # the handle again so the check does not leak a file descriptor.
        with open(filepath, 'r'):
            return True
    except FileNotFoundError:
        return False


def ensure_dir_exists(dirpath):
    """
    Creates the directory dirpath (and any missing parents) if needed.

    None and the empty string are ignored, so the result of
    os.path.dirname() on a bare filename can be passed in directly.
    """
    if dirpath is None or dirpath == '':
        return
    os.makedirs(dirpath, exist_ok=True)


def get_root_asset(filepath):
    """
    Builds the path of a resource or asset shipped with the application.

    The path is resolved relative to the directory of the root module
    (__main__), which should be the python file initially invoked. The
    file is not checked for existence.

    Args:
        filepath: path of the asset relative to the root module directory.

    Returns:
        The joined path as a string.
    """
    main_file = sys.modules['__main__'].__file__
    module_root = os.path.abspath(os.path.dirname(main_file))
    return os.path.join(module_root, filepath)


def get_asset(module, filepath):
    """
    Builds the path of a resource or asset shipped next to a module.

    Args:
        module: the module object, or its name (e.g. __name__) as found
            in sys.modules, to search relative to.
        filepath: the relative filepath of the file inside the directory
            that contains the module's source file.

    Returns:
        The joined path as a string. The file is not checked for
        existence.

    Raises:
        KeyError: if module is a name that is not in sys.modules.
        TypeError: if module is neither a string nor a module object.
    """
    if isinstance(module, str):
        module_file = sys.modules[module].__file__
    elif inspect.ismodule(module):
        # The previous check was isinstance(module, module), which raised
        # TypeError for every module object instead of accepting it.
        module_file = module.__file__
    else:
        raise TypeError('Invalid Usage')

    root = module_file
    # Only a symlinked module file itself is resolved to its target, so
    # assets are looked up next to the real source file.
    if os.path.islink(root):
        root = os.path.realpath(root)
    root = os.path.dirname(os.path.abspath(root))

    return os.path.join(root, filepath)
def create_func(constructor, cls_method):
    """
    Wraps an unbound method so it can be invoked like a plain function.

    Each call of the returned function builds a fresh object with
    constructor() and runs cls_method on it, forwarding all positional
    and keyword arguments.

    Args:
        constructor: zero-argument callable returning the object that the
            method is invoked on.
        cls_method: function taking that object as its first argument.

    Returns:
        A function that returns whatever cls_method returns.
    """
    def func(*call_args, **call_kwargs):
        # A new instance is constructed on every invocation.
        return cls_method(constructor(), *call_args, **call_kwargs)

    return func
def __init__(
        self,
        configspec_filepath=None,
        config_filepath=None,
        *args, **kwargs
):
    """
    Builds the application: loads configuration, configures logging, and
    registers CLI options, services, and the command menu.

    Args:
        configspec_filepath: path to the config.spec validation file.
            When None it is located with self._get_configspec_filepath().
        config_filepath: path to the site config file. When None it is
            located with self._get_config_filepath(appname, appauthor).
        **kwargs: 'appname' and 'appauthor' are read; missing or falsy
            values fall back to DEFAULT_APP_NAME / DEFAULT_APP_AUTHOR.
    """
    self.appname = kwargs.get('appname') or DEFAULT_APP_NAME
    self.appauthor = kwargs.get('appauthor') or DEFAULT_APP_AUTHOR

    if configspec_filepath is None:
        configspec_filepath = self._get_configspec_filepath()

    # Without an explicit config file, fall back to the per-user location.
    # Passing None through made the config loader fail in open(None).
    if config_filepath is None:
        config_filepath = self._get_config_filepath(
            self.appname, self.appauthor)

    self._dependencies = {}

    # Instantiate root application context (container for globals)
    config = cfg.Config(configspec_filepath)
    config.load_config_from_file(config_filepath)

    logger = log.LoggingLayer(self.appname, self.appauthor)

    # NOTE(review): original author was unsure whether calling
    # configure_logging() here as well as elsewhere is okay -- confirm.
    logger.configure_logging()

    self.ctx = ApplicationContext(config, logger)
    # Services are registered as factories; 'ctx' always yields self.ctx.
    self['ctx'] = lambda: self.ctx

    self.cli = cli.CommandTree()  # Command-line interface

    # Optional hooks that a subclass may define.
    for hook_name in ('_cli_options', '_services', '_command_menu'):
        hook = getattr(self, hook_name, None)
        if callable(hook):
            hook()
+ + The factory function is called to return an instance of a service object. + """ + self._dependencies[service_name] = value + + def _construct_model(self, model_constructor, *args): + """ + Performs dependency resolution and instantiates an object of given type. + + This takes in the reference to a class constructor and a list of names + of the dependencies that need passed into it, constructs that object and + returns it. Models contain business logic and application functionality. + + Args: + model_constructor: reference to object constructor. + """ + dependency_names = args + dep_references = [] + for dep_name in dependency_names: + dep_references.append(self[dep_name]) + return model_constructor(*dep_references) + + def _get_config_filepath(self, app_name, app_author, config_filename='config.ini'): + """ + Attempt to find config.ini in the user's config directory. + + On Linux, this will be /home//.config//config.ini + On Windows, this will be C:\\Users\\\\AppData\\Local\\\\config.ini + """ + dirname = appdirs.user_config_dir(app_name, app_author) + filepath = os.path.join(dirname, config_filename) + _bootstrap_logger.info('default config filepath calculated to be: %s', filepath) + return filepath + + def _get_configspec_filepath(self, configspec_filename='config.spec'): + """ + Attempt to find config.spec inside the installed package directory. + """ + return _util.get_root_asset(configspec_filename) + + def _inject_service_dependencies(self, constructor): + """ + Returns a function that, when called, constructs a new object for + business/application logic with the listed dependencies. + + Args: + constructor: service class to be created object. + """ + sig = inspect.signature(constructor.__init__) + params = sig.parameters + params = [params[paramname].name for paramname in params] # Convert Param() type => str + cls_dependencies = params[1:] # Skip 'self' parameter on class methods. 
class Config:
    """
    Structure to store application runtime configuration. Also contains
    functionality to load configuration from a local site file.

    Item access (config[key]) is delegated to the underlying
    configobj.ConfigObj stored in self.config_obj, which is None until
    load_config_from_file() has succeeded in constructing it.
    """
    def __init__(self, configspec_filepath=None):
        self.config_obj = None
        self._config_filepaths = []  # every config file passed to load
        self._configspec_filepath = None
        # Goes through the property setter, which checks the file exists.
        self.configspec_filepath = configspec_filepath

    def __delitem__(self, key):
        """
        Deletes the configuration item identified by key from the
        internal configuration storage. A missing key is ignored.
        """
        try:
            # Must target config_obj: 'del self[key]' re-entered this
            # method and recursed without end.
            del self.config_obj[key]
        except KeyError:
            pass

    def __getitem__(self, key):
        """
        Returns the value of the configuration item identified by key,
        converted with its dict() method.

        Raises:
            KeyError: if key is not present in the configuration.
        """
        return self.config_obj[key].dict()

    def __setitem__(self, key, value):
        """
        Assigns value to the configuration item identified by key.
        """
        # Must target config_obj: 'self[key] = value' re-entered this
        # method and recursed without end.
        self.config_obj[key] = value

    @property
    def config_filepath(self, idx=0):
        """
        Returns the config filepath (optionally specified by index
        when using multiple config files), or None if there is none.
        """
        if idx < 0:
            raise ValueError('invalid idx argument: index must not be negative')
        try:
            return self._config_filepaths[idx]
        except IndexError:
            # List indexing raises IndexError (not ValueError).
            return None

    @config_filepath.setter
    def config_filepath(self, value, idx=0):
        """
        Assigns value as the config filepath (optionally specified by
        index when using multiple config files).
        """
        if idx < 0:
            raise ValueError('invalid idx argument: index must not be negative')
        if idx < len(self._config_filepaths):
            self._config_filepaths[idx] = value
        else:
            # Nothing recorded at that position yet (e.g. empty list).
            self._config_filepaths.append(value)

    @property
    def configspec_filepath(self):
        """Returns the config.spec filepath in use, or None."""
        return self._configspec_filepath

    @configspec_filepath.setter
    def configspec_filepath(self, filepath):
        """
        Assigns the config.spec filepath. None clears it; a path that
        cannot be opened is rejected with an error log and the previous
        value is kept.
        """
        if filepath is None:
            # No spec requested; probing None would fail inside open().
            self._configspec_filepath = None
            return
        if _util.does_file_exist(filepath):
            self._configspec_filepath = filepath
        else:
            _bootstrap_logger.error(
                'failed to set config.spec: file not found '
                '(%s)', filepath)

    def load_config_from_file(self, config_filepath):
        """
        Loads configuration settings from file, overwriting all
        configuration, validates them against config.spec when one is
        set, and writes the resulting configuration back to the file.

        Args:
            config_filepath: path of the config file. When None, the
                configuration is built in memory only and nothing is
                written.

        Returns:
            True on success, False if loading or validation failed.
        """
        # Record all config.ini files passed in
        if (config_filepath is not None
                and config_filepath not in self._config_filepaths):
            self._config_filepaths.append(config_filepath)

        # Check for config.spec
        configspec_filepath = self.configspec_filepath
        if configspec_filepath:
            _bootstrap_logger.info('using config.spec: %s', configspec_filepath)
        else:
            _bootstrap_logger.info('config.spec not defined')
        _bootstrap_logger.info('using config file: %s', config_filepath)

        # Pre-check for config.ini existence
        if config_filepath is not None and _util.does_file_exist(config_filepath):
            _bootstrap_logger.info('existing config file found')
        else:
            _bootstrap_logger.info('no config file found: using defaults')

        # interpolation='template' changes config file variable replacement to
        # use the form $var instead of %(var)s, which is useful to enable
        # literal %(text)s values in the config.
        options = {'interpolation': 'template'}
        if configspec_filepath:
            options['configspec'] = configspec_filepath
        try:
            self.config_obj = configobj.ConfigObj(config_filepath, **options)
        except configobj.ParseError:
            msg = 'failed to load config: error in config.spec configuration: {}'.format(config_filepath)
            _bootstrap_logger.error(msg)
            _util.eprint(msg)
            return False
        except OSError:
            msg = 'failed to load config: config.spec file not found'
            _bootstrap_logger.error(msg)
            _util.eprint(msg)
            return False

        # Hack the configobj module to alter the interpolation for validate.py:
        configobj.DEFAULT_INTERPOLATION = 'template'
        self.config_obj.filename = config_filepath

        if configspec_filepath:
            # Validate config.ini against config.spec
            try:
                _bootstrap_logger.info('validating config file against spec')
                val = validate.Validator()
                # copy=True also copies spec defaults into the config so
                # they are included when the file is written below.
                test = self.config_obj.validate(val, copy=True)
                if test is not True:
                    _bootstrap_logger.critical('config file failed validation')
                    _bootstrap_logger.critical('config file errors: %s', test)
                    return False
            except ValueError:
                _bootstrap_logger.error('failed validating configspec')
                return False

        # The file is (re)written on every load, whether or not it existed.
        if config_filepath is not None:
            _bootstrap_logger.info('writing new config file: %s', config_filepath)
            _util.ensure_dir_exists(os.path.dirname(config_filepath))
            self.config_obj.write()

        _bootstrap_logger.info('done loading config file')
        return True

    def print_config(self):
        """
        Print configuration to stdout.
        """
        print('config:')

        self.config_obj.walk(print)
        for section in self.config_obj.sections:
            print(section)
            for key in self.config_obj[section]:
                print(' ', self.config_obj[section][key])
If function parameters have defaults, those will be + available for override else they use the function defaults. + + Print helpful information: + + ./scriptname -h # View tier-0 help and usage doc + ./scriptname [submenu] -h # View submenu help and usage doc + ./scriptname [submenu] [command] -h # View command documentation and parameters + + argparse is finicky about argument placement: + + ./scriptname + [application arguments] + [submenu] [submenu arguments] + [command] [command arguments] + + For example, + + ./scriptname --option="value" [submenu] [command] + + is different than + + ./scriptname [submenu] [command] --option="value" + + in that option is being applied to the application in the first example and + applied to the refresh_datasets command (under the nhsn command group) in + the second. In the same way the -h, --help options print different docs + depending on where the help option was passed. + """ + def __init__(self): + self.root_parser = argparse.ArgumentParser() + self.submenu_param = None # submenu_param is the variable name + # of the root submenu argument, i.e. the arg + # in root_parser which selects the submenu. + self.entries = {} + # NOTE(MG) Implementation note: + # CommandTree uses only one of these internal structures (i.e. mutually exclusive), + # 'entries' is used when there is a submenu linked to multiple commands. + # '_cmd_tree_is_single_command' and '_single_command' instead are used + # when the CommandTree is linked to one and only one command. + self._cmd_tree_is_single_command = False + self._single_command = None + + def print_tree(self): + import pprint + pprint.pprint(self.entries) + + def add_argument(self, *args, **kwargs): + """ + Adds an argument to the root parser. + """ + _bootstrap_logger.info('adding argument to root parser: %s and %s', args, kwargs) + self.root_parser.add_argument(*args, **kwargs) + + def init_submenu(self, param_name, is_required=False): + """ + Creates a root-level submenu with no entries. 
SubMenu node is + returned which can have submenus and commands attached to it. + """ + # Creates an argument as a slot in the underlying argparse. + subparsers = self.root_parser.add_subparsers( + dest = param_name, + metavar = param_name, + required = is_required + ) + + submenu = SubMenu(self, subparsers, param_name) + submenu.submenu_path = '' + submenu.var_name = param_name + + _bootstrap_logger.info('Initialized root-level submenu: Parameter = \'%s\'', param_name) + self.entries[param_name] = submenu + self.submenu_param = param_name + + return submenu + + def register_command( + self, func, cmd_name=None, func_signature=None, + docstring=None + ): + """ + When no submenu functionality is desired, this links a single + command into underlying argparse options. + """ + # begin copy-paste from SubMenu.register_command + if inspect.isfunction(func): + # print('func is function') + pass + elif inspect.ismethod(func): + pass + # print('func is method') + else: + raise Exception('bad value passed in for function') + + if not cmd_name: + # safe try/except + cmd_name = func.__name__ + + if func_signature is None: + func_signature = inspect.signature(func) + + if docstring is None: + docstring = func.__doc__ + + sig = func_signature + params = sig.parameters + + # help is displayed next to the command in the submenu enumeration or + # list of commands: + help_text = HelpGenerator.generate_help_from_sig(docstring) + # description is displayed when querying help for the specific command: + description_text = HelpGenerator.generate_description_from_sig(docstring) + # end copy-paste from SubMenu.register_command + + # begin copy-paste then editted from SubMenu.register_command + # For each paramter in the function create an argparse argument in + # the child ArgumentParser created for this menu entry: + for key in params: + if key == 'self': + continue + param = params[key] + + if '=' in str(param): + if param.default is None: + helptext = 'default provided' + else: + 
helptext = "default = '{}'".format(param.default) + self.root_parser.add_argument( + key, + help=helptext, + nargs='?', + default=param.default) + else: + helptext = 'required' + self.root_parser.add_argument( + key, + help=helptext) + + # # Wrapper function that instantiates an object and runs a method + # # on-demand. The object is created, injected with necessary + # # dependencies or services, and the method is invoked. + # def func(*args, **kwargs): + # obj = constructor() + # return cls_method(obj, *args, **kwargs) + + # Build the CommandEntry structure + cmd = CommandEntry() + cmd.argparse_node = self.root_parser + cmd.cmd_name = cmd_name + cmd.func_signature = sig + # cmd.func_ref = None + cmd.callback = func + + registered_name = cmd_name + _bootstrap_logger.info('registered command: %s', registered_name) + # end copy-paste then editted from SubMenu.register_command + + self._cmd_tree_is_single_command = True + self._single_command = cmd + self._entries = None + + # def _validate(self): + # pass + # # TODO(MG): + # # subparser can not be empty, needs to have parsers attached + + def parse(self, args=None): + if args is None: + args = sys.argv[1:] + + try: + # on error, prints some argparse error messages: + pargs, unk = self.root_parser.parse_known_args(args) + + # if len(unk) > 0: + # _bootstrap_logger.error( + # 'failed to interpret argument(s) or command-line switch from shell: %s', + # unk) + + # if EXPLICIT_FAIL_ON_UNKNOWN_ARGS: + # _bootstrap_logger.warn( + # 'failed to parse arguments: explicitly failing to be safe') + # return False, False + + if hasattr(pargs, 'usage'): + pass + # print('found usage in app_skellington') + + return pargs, unk, True + + # Note: SystemExit is raised when '-h' argument is supplied. 
def _lookup_command(self, args):
    """
    Resolves parsed command-line arguments to the registered command.

    Args:
        args: dict of parsed argparse values (run_command passes
            vars() of the parsed namespace). The keys that select a
            submenu entry are removed from it as they are consumed.

    Returns:
        The matching CommandEntry.

    Raises:
        NoCommandSpecified: if no submenu is registered, a submenu
            selector is absent from args, or the selected value is
            neither a submenu nor a command.
    """
    # In the case there is at-most one command registered in
    # the CommandTree with no SubMenu (submenu will be disabled
    # in this case):
    if self._cmd_tree_is_single_command:
        assert self._entries is None, 'corrupt data structure in CommandMenu'
        assert isinstance(self._single_command, CommandEntry), 'corrupt data structure in CommandMenu'
        return self._single_command

    # There is at least one submenu we need to go down:
    assert self._single_command is None, 'corrupt data structure in CommandMenu'

    # Key or variable name used by argparse to store the submenu options
    argparse_param = self.submenu_param  # e.g.: submenu_root
    submenu = self.entries.get(argparse_param)
    if submenu is None:
        # Nothing was registered; previously this surfaced as a KeyError.
        _bootstrap_logger.error('no command or submenu has been registered')
        raise app_container.NoCommandSpecified('No command specified.')

    while True:
        if argparse_param not in args:
            # Previously this printed a message, blocked on input('') and
            # then failed with KeyError; report it as a missing command.
            _bootstrap_logger.error(
                'menu parameter not found in args: %s', argparse_param)
            raise app_container.NoCommandSpecified('No command specified.')

        # pop value
        val = args.pop(argparse_param)
        _bootstrap_logger.debug('argparse command is \'%s\' = %s', argparse_param, val)

        lookup = submenu.entries.get(val)
        _bootstrap_logger.debug('lookup, entries[%s] = %s', val, lookup)

        if isinstance(lookup, SubMenu):
            # Descend one level; the next selector is the submenu's var.
            submenu = lookup
            argparse_param = submenu.var_name
        elif isinstance(lookup, CommandEntry):
            return lookup
        else:
            raise app_container.NoCommandSpecified('No command specified.')
+ """ + if inspect.isfunction(func): + # print('func is function') + pass + elif inspect.ismethod(func): + pass + # print('func is method') + else: + raise Exception('bad value passed in for function') + + if not cmd_name: + # safe try/except + cmd_name = func.__name__ + + if func_signature is None: + func_signature = inspect.signature(func) + + if docstring is None: + docstring = func.__doc__ + + sig = func_signature + params = sig.parameters + + # help is displayed next to the command in the submenu enumeration or + # list of commands: + help_text = HelpGenerator.generate_help_from_sig(docstring) + # description is displayed when querying help for the specific command: + description_text = HelpGenerator.generate_description_from_sig(docstring) + + # Entry in local argparse._SubParsersAction + # type = ArgumentParser + child_node = self.subparsers_obj.add_parser( + cmd_name, # Note: cmd_name here will be the VALUE + # passed into the argparse arg VARIABLE NAME + # created when the SubMenu/argparse.addZ_subparsers() + # was created. + help=help_text, + description=description_text + ) + + # For each paramter in the function create an argparse argument in + # the child ArgumentParser created for this menu entry: + for key in params: + if key == 'self': + continue + param = params[key] + + if '=' in str(param): + if param.default is None: + helptext = 'default provided' + else: + helptext = "default = '{}'".format(param.default) + child_node.add_argument( + key, + help=helptext, + nargs='?', + default=param.default) + else: + helptext = 'required' + child_node.add_argument( + key, + help=helptext) + + # # Wrapper function that instantiates an object and runs a method + # # on-demand. The object is created, injected with necessary + # # dependencies or services, and the method is invoked. 
+ # def func(*args, **kwargs): + # obj = constructor() + # return cls_method(obj, *args, **kwargs) + + # Build the CommandEntry structure + cmd = CommandEntry() + cmd.argparse_node = child_node + cmd.cmd_name = cmd_name + cmd.func_signature = sig + # cmd.func_ref = None + cmd.callback = func + + registered_name = '{}.{}'.format( + self.submenu_path, + cmd_name) + _bootstrap_logger.info('registered command: %s', registered_name) + self.entries[cmd_name] = cmd + + def create_submenu( + self, var_name, cmd_entry_name=None, is_required=False + ): + """ + Creates a child-submenu. + + Arguments + --------- + var_name: + A code-facing argparse parameter used to store the + value/entry chosen by the user. + + cmd_entry_name: + A user-facing name used to select created submenu. + If not provided, the user-facing command name defaults + to the same name as the code-facing argparse parameter + + is_required: + Switches if a value must be selected in the created submenu. + If not, it's an optional positional argument. + """ + if cmd_entry_name is None: + cmd_entry_name = var_name + + # Create an entry in self's submenu: + # type = ArgumentParser + entry_node = self.subparsers_obj.add_parser( + cmd_entry_name, + help='sub-submenu help', + description='sub-sub description') + + # Turn entry into a submenu of it's own: + # type = _SubParsersAction + subp_node = entry_node.add_subparsers( + dest = var_name, + metavar = var_name, + required = is_required) + + submenu = SubMenu( + self.parent, + subp_node, + cmd_entry_name) + + submenu.var_name = var_name + + submenu.submenu_path = '{}.{}'.format(self.submenu_path, cmd_entry_name) + submenu_name = submenu.submenu_path + + _bootstrap_logger.info('registered submenu: %s', submenu_name) + self.entries[cmd_entry_name] = submenu + return submenu + + def __repr__(self): + return 'SubMenu({})<{}>'.format( + self.name, + ','.join(['cmds']) + ) + +class CommandEntry: + """ + Structure for a command-entry in the CLI. 
+ + Stores the command-subcommand names, the function signature which contains + the original parameters of the function-to-be-invoked, a reference to the + original function, and a callback function wrapper which, by convention, + instantiates the necessary objects (injecting dependencies, etc.) and + executes the original function. + + The CLI module has functionality to translate the original function + arguments into argparse options (creating the documentation also). Similarly, + it can convert from argparse options into a function call. + """ + def __init__(self): + self.argparse_node = None + + self.cmd_name = None # Don't think we need. And needs to be changed + # from SubMenu + self.menu_path = None + self.func_signature = None + self.func_ref = None + self.callback = None + + def __repr__(self): + return 'CommandEntry<{}>'.format(self.cmd_name) + +class HelpGenerator: + def __init__(self): + pass + + @staticmethod + def generate_help_from_sig(doctext): + """ + The 'help' text is displayed next to the command when enumerating + the submenu commands. + """ + if doctext is None: + return doctext + regex = '(.*?)[.?!]' + match = re.match(regex, doctext, re.MULTILINE | re.DOTALL) + if match: + return match.group(1) + '.' + return doctext + + @staticmethod + def generate_description_from_sig(doctext): + """ + The 'description' paragraph is provided when the user requests help + on a specific command. + """ + if doctext is None: + return doctext + regex = '(.*?)[.?!]' + match = re.match(regex, doctext, re.MULTILINE | re.DOTALL) + if match: + return match.group(1) + '.' + return doctext + diff --git a/lib/app_skellington/log.py b/lib/app_skellington/log.py new file mode 100644 index 0000000..5fb308e --- /dev/null +++ b/lib/app_skellington/log.py @@ -0,0 +1,165 @@ +import appdirs +import colorlog +import logging +import logging.config +import os + +from ._bootstrap import _bootstrap_logger +from .
import _util + +DEFAULT_LOG_SETTINGS = { + 'formatters': { + 'colored': { + 'class': 'colorlog.ColoredFormatter', + # 'format': '%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(white)s%(message)s' + 'format': '%(white)s%(name)7s%(reset)s|%(log_color)s%(message)s', + } + }, + + 'handlers': { + 'stderr': { + 'class': 'logging.StreamHandler', + 'level': 'debug', + 'formatter': 'colored' + } + + }, + + 'loggers': { + 'root': { + 'handlers': ['stderr',], + 'level': 'debug' + }, + 'app_skellington': { + # 'handlers': ['stderr',], + 'level': 'critical', + 'propagate': False + } + } +} + +class LoggingLayer: + def __init__(self, appname, appauthor, config=None): + self.appname = appname + self.appauthor = appauthor + self.loggers = {} + + def __getitem__(self, k): + """ + Returns the Logger object named k. + + Example: + log = LoggingLayer(...) + log['db'].info('loaded database module') + + Args: + k: the name of the logger to retrieve (k, i.e. key) + """ + logger = self.loggers.get(k) + if not logger: + logger = logging.getLogger(k) + self.loggers[k] = logger + return logger + + def configure_logging(self, config_dict=None): + """ + Set the logging level for the process. Verbosity is controlled by a + parameter in the config. + + Advice: While DEBUG verbosity is useful to debug, it can produce too much + noise for typical operation. + """ + if config_dict is None: + _bootstrap_logger.debug('No application logging configuration provided.
Using default') + config_dict = DEFAULT_LOG_SETTINGS + + self.transform_config(config_dict) + + try: + # TODO(MG) switch to pretty-print, as it'd be more human readable + _bootstrap_logger.debug('Log configuration: %s', config_dict) + logging.config.dictConfig(config_dict) + except Exception as ex: + print('unable to configure logging:', ex, type(ex)) + + def transform_config(self, config_dict): + """ + Fix some incompatibilities and differences between the config-file logging + parameters and the final config dictionary passed into the logging module. + """ + # Version should be hard-coded 1, per Python docs + if 'version' in config_dict: + if config_dict['version'] != 1: + _bootstrap_logger.warn("logging['version'] must be '1' per Python docs") + config_dict['version'] = 1 + + self._add_own_logconfig(config_dict) + + # Replace logger level strings with value integers from module + for handler in config_dict['handlers']: + d = config_dict['handlers'][handler] + self._convert_str_to_loglevel(d, 'level') + + # Replace logger level strings with value integers from module + for logger in config_dict['loggers']: + d = config_dict['loggers'][logger] + self._convert_str_to_loglevel(d, 'level') + + # Replace 'root' logger with '', logging module convention for root handler + # Note: '' is disallowed in ConfigObj (hence the reason for this replacement) + config_dict['loggers'][''] = config_dict['loggers']['root'] + del config_dict['loggers']['root'] + + + # Evaluate the full filepath of the file handler + if 'file' not in config_dict['handlers']: + return + + if os.path.abspath(config_dict['handlers']['file']['filename']) ==\ + config_dict['handlers']['file']['filename']: + # Path is already absolute + pass + else: + dirname = appdirs.user_log_dir(self.appname, self.appauthor) + _util.ensure_dir_exists(dirname) + log_filepath = os.path.join(dirname, config_dict['handlers']['file']['filename']) + config_dict['handlers']['file']['filename'] = log_filepath + + def 
_add_own_logconfig(self, config_dict): + if os.environ.get('APPSKELLINGTON_ENABLE_LOGGING', None): + if 'app_skellington' not in config_dict['loggers']: + config_dict['loggers']['app_skellington'] = { + 'level': 'debug', 'propagate': False + } + else: + config_dict['loggers']['app_skellington']['level'] = 'debug' + + def _convert_str_to_loglevel(self, dict_, key): + """ + Convert a dictionary value from a string representation of a log level + into the numeric value of that log level. The value is modified in-place + and is passed in by a dictionary reference and a key name. + + For example, + d = {'loggers': {'cas': {'level': 'critical'}}} + convert_str_to_loglevel(d['loggers']['cas'], 'level') + => + d is now {'loggers': {'cas': {'level': logging.CRITICAL}}} + """ + try: + s = dict_[key] + except KeyError as ex: + raise + if s == 'critical': + dict_[key] = logging.CRITICAL + elif s == 'error': + dict_[key] = logging.ERROR + elif s == 'warning': + dict_[key] = logging.WARNING + elif s == 'info': + dict_[key] = logging.INFO + elif s == 'debug': + dict_[key] = logging.DEBUG + elif s == 'all': + dict_[key] = logging.NOTSET + diff --git a/lib/setup.py b/lib/setup.py new file mode 100644 index 0000000..6127a3c --- /dev/null +++ b/lib/setup.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# +# Usage: +# +# First, enable the python environment you want to install to, or if installing +# system-wide then ensure you're logged in with sufficient permissions +# (admin or root to install to system directories) +# +# installation: +# +# $ ./setup.py install +# +# de-installation: +# +# $ pip uninstall + + +from setuptools import setup + +__project__ = 'app_skellington' +__version__ = '0.1.0' + +setup( + name = __project__, + version = __version__, + description = 'A high-powered 2-level CLI framework', + author = 'Mathew Guest', + author_email = 'mathewguest@gmail.com', + url = 'https://git-mirror.zavage-software.com', + + # Third-party dependencies; will be automatically
installed + install_requires = ( + 'appdirs', + 'configobj', + 'colorlog', + 'pprint', + ), + + # Local packages to be installed (our packages) + packages = ( + 'app_skellington', + ), + +) + diff --git a/road2philosophy.py b/road2philosophy.py new file mode 100755 index 0000000..7b2ffd0 --- /dev/null +++ b/road2philosophy.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +import wikicrawl +import settings +wikicrawl.start_app() +input(' to exit') + diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 71c2663..5c9f4f3 --- a/setup.py +++ b/setup.py @@ -3,23 +3,42 @@ # required third-party dependencies and package the app. You can also # install the application system-wide. -from setuptools import setup +from setuptools import setup, find_packages __project__ = 'wikicrawl' # If you're looking for a versioning scheme, one revered pattern # can be read about at http://semver.org __version__ = '0.9.0' -setup(name = __project__, - version = __version__, - description = '', - author = '', - author_email = '', - url = '', - install_requires = ('yandex.translate', - 'selenium', - 'colorlog', - 'baker' - ), - packages = ('wikicrawl',)) +setup( + name = __project__, + version = __version__, + description = '', + author = '', + author_email = '', + url = '', + install_requires = ( + 'yandex.translate', + 'selenium', + 'colorlog', + 'baker', + + ## Additional dependencies required from app_skellington: + 'appdirs', + 'configobj', + 'colorlog', + 'pprint' + ), + packages = find_packages( + where='.', + include=('*',), + exclude=() + ), + package_dir = { + 'app_skellington': 'lib' + }, + scripts = ( + 'road2philosophy.py', + ) +) diff --git a/wikicrawl/__init__.py b/wikicrawl/__init__.py index 55caacd..d0a6e46 100644 --- a/wikicrawl/__init__.py +++ b/wikicrawl/__init__.py @@ -7,6 +7,5 @@ # We export config.init() as a reference to wikicrawl.config.init() and # wikicrawl.main as a reference to wikicrawl.cli.main -from .config import init -from .main import main +from 
.app import start_app diff --git a/wikicrawl/app.py b/wikicrawl/app.py new file mode 100644 index 0000000..5f202bb --- /dev/null +++ b/wikicrawl/app.py @@ -0,0 +1,87 @@ +# The command-line interface module creates an interface for +# interacting with the python program (wikicrawl). This is an implementation +# of the baker demo shown previously. The user can type in commands to +# make the program do things. + +import sys +if sys.platform == 'linux': + import readline # Needed for command history and arrows to work + +from . import browser +from . import dal +from . import model + +from app_skellington import _util +import app_skellington +from app_skellington._util import register_class_as_commands + +# Problem pages: +# Decision (from politics) +# Malaysia (goes inside parenthesis) +# Soft-sediment_deformation_structures (doesn't find link) +# Chemicals (loops at philosophical) + +class InteractiveInterface(app_skellington.ApplicationContainer): + def __init__(self, *args, **kwargs): + configspec_filepath = _util.get_asset(__name__, 'config.spec') + config_filepath = self._get_config_filepath( + 'road2philosophy', '', 'config.ini' + ) + + super().__init__( + configspec_filepath=configspec_filepath, + config_filepath=config_filepath, + *args, **kwargs + ) + + # Configure logging: + # log_config = self.ctx.config['logging'] + # self.ctx.log.configure_logging(log_config) + # self.ctx.log.configure_logging() + + def invoke_from_cli(self): + rc = self.load_command() + if not rc: + print('Invalid command. Try -h for usage') + return + # load config + self.invoke_command() + + def usage(self): + s = ''' +...usage info to come ;)... 
+''' + print(s) + + def _cli_options(self): + self.cli.add_argument( + '--usage', + help='Prints program usage information', + action='store_true' + ) + + def _services(self): + self['ctx'] = lambda: self.ctx + + self.dal = dal.DataLayer(self.ctx) + self['dal'] = lambda: self.dal + + self.browser = browser.Browser(self.ctx) + self['browser'] = lambda: self.browser + + self.model = model.Model(self.ctx, self.browser, self.dal) + self['model'] = lambda: self.model + + def _command_menu(self): + sm_root = self.cli.init_submenu('command') + self.sm_root = sm_root + + register_class_as_commands( + self, sm_root, + model.Model + ) + +def start_app(config=None): + x = InteractiveInterface() + x.invoke_from_cli() + diff --git a/wikicrawl/browser.py b/wikicrawl/browser.py index a4a7f29..ed63c01 100755 --- a/wikicrawl/browser.py +++ b/wikicrawl/browser.py @@ -12,39 +12,44 @@ import selenium import selenium.webdriver -from . import config -from . import log +class Browser: + def __init__(self, ctx): + self.ctx = ctx -# This function has a parameter (driver) that passes in a value. In this case, -# this driver variable defaults to the string 'chrome'. The code can call -# create_webdriver() which is the same as create_webdriver('chrome') but -# can alternatively call create_webdriver('firefox') and get different -# functionality. -def create_webdriver(driver='chrome'): - if driver == 'chrome': - return create_webdriver_chrome() - elif driver == 'firefox': - return create_webdriver_firefox() - elif driver == 'phantom': - return create_webdriver_phantom() - else: - log.LOGGER('browser').error('unable to handle webdriver request: %s' % driver) - return + # This function has a parameter (driver) that passes in a value. In this case, + # this driver variable defaults to the string 'chrome'. The code can call + # create_webdriver() which is the same as create_webdriver('chrome') but + # can alternatively call create_webdriver('firefox') and get different + # functionality. 
+ def create_webdriver(self, driver='chrome'): + if driver == 'chrome': + return self.create_webdriver_chrome() + elif driver == 'firefox': + return self.create_webdriver_firefox() + elif driver == 'phantom': + return self.create_webdriver_phantom() + else: + self.ctx.log['browser'].error('unable to handle webdriver request: %s' % driver) + return -def create_webdriver_firefox(): - profile = selenium.webdriver.FirefoxProfile() - profile.set_preference("general.useragent.override", config.obj.WEBDRIVER_USER_AGENT) - driver = selenium.webdriver.Firefox(profile) - return driver + def create_webdriver_firefox(self): + profile = selenium.webdriver.FirefoxProfile() + profile.set_preference( + "general.useragent.override", + self.ctx.config['app']['webdriver_user_agent']) + driver = selenium.webdriver.Firefox(profile) + return driver -def create_webdriver_chrome(): - opt = selenium.webdriver.chrome.options.Options() - opt.add_argument('--user-agent=' + config.obj.WEBDRIVER_USER_AGENT) - driver = selenium.webdriver.Chrome(executable_path=config.obj.CHROMEDRIVER_EXE, - chrome_options=opt) - return driver + def create_webdriver_chrome(self): + opt = selenium.webdriver.chrome.options.Options() + opt.add_argument('--user-agent=' + self.ctx.config['app']['webdriver_user_agent']) + driver = selenium.webdriver.Chrome( + executable_path=self.ctx.config['app']['chromedriver_exe'], + chrome_options=opt + ) + return driver -def create_webdriver_phantom(): - driver = selenium.webdriver.PhantomJS() - return driver + def create_webdriver_phantom(self): + driver = selenium.webdriver.PhantomJS() + return driver diff --git a/wikicrawl/cli.py b/wikicrawl/cli.py deleted file mode 100644 index e14a06e..0000000 --- a/wikicrawl/cli.py +++ /dev/null @@ -1,203 +0,0 @@ -# The command-line interface module creates an interface for -# interacting with the python program (wikicrawl). This is an implementation -# of the baker demo shown previously.
The user can type in commands to -# make the program do things. - -import baker -import sys - -if sys.platform == 'linux': - import readline # Needed for command history and arrows to work - -from . import log -from . import model -from . import config - -# Problem pages: -# Decision (from politics) -# Malaysia (goes inside parenthesis) -# Soft-sediment_deformation_structures (doesn't find link) -# Chemicals (loops at philosophical) - -commander = baker.Baker() - -class InteractiveInterface: - def __init__(self): - # Instantiate the variable self.model as an object - # of instance of the Model class defined in the model - # module. model.Model refers to the Model class in the - # model module and this line creates a new variable (self.model) - # which is a variable that is an instance of Model, i.e. - # it has the type Model and has Model.methods() available - # to it. - # - # self.model is a variable that is attached to the instance/object - # returned by this constructor that has the type InteractiveInterface. - self.model = model.Model() - - def run_command(self, args, main=True): - """ - Runs the command-line interface for a single command. - - If called by InteractiveInterface.run(sys.argv), this method - will execute the commands and arguments specified on command - line when running this program. Alternatively, the code could - pass in a different set of arguments to specify what to do. - See start_command_loop() for more information. - """ - try: - commander.run(argv=args, main=True, help_on_error=True, - instance=self) - except baker.CommandError as ex: - log.LOGGER['cli'].warn('incorrect user input: %s' % ex) - commander.usage() - except baker.TopHelp as ex: - commander.usage() - except Exception as ex: - log.LOGGER['cli'].error('caught general exception!!') - log.LOGGER['cli'].error(ex) - - def start_command_loop(self): - """ - Repeatedly asks the user what command to run until they exit. 
- - This method calls InteractiveInterface.run(args) a little bit - differently. Instead of passing the arguments from the command-line - that were passed in when invoking the python wikicrawl app, - this asks the user for a line of textual input and passes - those strings to run() as the arguments. This way, the user can - access an interactive shell and repeatedly issue different - commands while the application is running. - """ - commander.usage() - self.model.open_browser() - while True: - print('$ ', end = '') # Display to the user a command prompt - # The dollar-sign is a common indication - # of a shell that communicates to the user - # that we are waiting for their textual - # input. The end = '' indicates to python - # to NOT drop to a newline after printing - # in the terminal. Instead, let the user - # type their command on the same line as - # the printed '$ '. - try: - inp = input() - except EOFError: # +D will send "End Line" and exit the command loop - break - - # Note on "arguments" (mg): - # Whenever a program is run in windows or *nix, the operating - # system passes in the command string that was used to invoke - # the program. You can append data in that command to configure - # switches or values going into the program on the fly. For - # example, you can invoke this wikicrawl app in more than one - # way. You can of course run "python launcher.py" to run the - # software but you can also pass in an argument. You can - # alternatively run "python launcher.py ..." - # and the operating system will provide the values into - # the process that is running as variables. - # - # In a real world use case, many commands provide switches to - # adjust what the program does. For example, - # - # The command: - # find music -name "*justin*bieber*" - # runs the "find" program and asks to find all the filenames that match the - # pattern *justin*bieber* in the "music" directory. 
- # (music, -name, "*justin*biever*") are argument parameters - # that are passed into the program. The program is coded to - # parse and interpret these values and execute differently based - # on the values passed in. This is one way to pass in information - # into a running program. Some other ways are to read from a file - # (such as how we read from settings.py to load the runtime - # configuration), from something called environment variables - # (won't get into but another set of values provided to programs - # from the operating system), or they can be hard-coded into - # the application. - # - # Side note: arguments are not unique to python (almost all - # programming languages implement arguments), the functionality - # is defined by the application (some programs require arguments, - # some are optional, and the syntax for sending in argument - # parameters are different and defined by the individual programs, - # and lastly, the first argument sent in is the script name or - # filename of the script. In our case, the first argument is - # the string "launcher.py". If the user invoked the command - # as C:\Users\mguest\launcher.py then the first argument - # would be C:\Users\mguest\launcher.py. - - # What this method (start_command_loop()) does is provide a - # REPL shell which is a - # read-eval-print-loop. It repeatedly asks the user for an - # input (read), evaluates that input into an action (evaluate), - # give the user some feedback (print), and start the process - # over again (loop). When you call just "python", you are loading a - # program that gives you a REPL interactive shell. The way - # this wikicrawl app is implemented gives the user a REPL - # that has commands to interact with wikipedia pages. 
- - # Because we take in the input as a single string, we do - # a transformation to turn something like "do_random_page 5" - # into ["launcher.py", "do_random_page", "5"] which is how - # the arguments array would have been created if it were - # passed in the initial command instead of typed and interpretted - # as input as is done here. - args = [sys.argv[0], ] + inp.split() - - # The user can at any point in the command pass the argument - # switch "--help". If doing this, the command line interface - # will instead print out the inline documentation associated - # with this command and quit after doing so. For example, - # the user can type "python launcher.py do_random_page --help" - # and the program will spit out the generated documentation - # for the do_random_page command and run nothing. In our case, - # this documentation is created by the baker library and will - # print out the docstring associated with the method. Try it - # out in your shell (cmd.exe or powershell.exe) by invoking - # python launcher.py do_random_page --help - # You will see the program spit out the heredoc below the - # do_random_page method defined below. - if '--help' in args: - args.remove('--help') - try: - print('command usage:') - commander.usage(args[1]) - except Exception as ex: - print(type(ex), ex) - continue - - self.run_command(args, main=False) - - @commander.command - def play_specific_page(self, title): - pass - - @commander.command - def play_random_page(self): - """ - Instructs the wikicrawl application to play the game on a random - article. - """ - self.model.play_random_page() - - @commander.command - def play_multiple(self, n): - """ - Plays the wikicrawl game -times. - """ - try: - n = int(n) - except ValueError as ex: - log.LOGGER['cli'].warn('failed to process "%s" as a parameter' % n) - return False - for i in range(n): - self.model.play_random_page() - - @commander.command - def exit(self): - """ - Immediately exit the program. 
- """ - sys.exit(0) - diff --git a/wikicrawl/config.py b/wikicrawl/config.py deleted file mode 100644 index c4695d5..0000000 --- a/wikicrawl/config.py +++ /dev/null @@ -1,22 +0,0 @@ -# config module defines a place to store the external configuration/settings -# and is used to provide an interface to the runtime configuration for the -# program. - -import sys - -from . import log - -obj = {} - -def init(settings_obj): - global obj - obj = settings_obj - - find_chromedriver_path() - - - log.init_logging() - -def find_chromedriver_path(): - print(__file__) - diff --git a/wikicrawl/config.spec b/wikicrawl/config.spec new file mode 100644 index 0000000..480b795 --- /dev/null +++ b/wikicrawl/config.spec @@ -0,0 +1,76 @@ +[app] +# Filepath parameters - THESE MUST EXIST OR PROGRAM WILL NOT RUN!! +log_filename = string(min=0, max=255, default='/tmp/wikicrawl.log') +sqlite_dbfile = string(min=0, max=255, default='/home/mathew/.wikicrawler.db') +chromedriver_exe = string(min=0, max=255, default='/usr/bin/chromedriver') +# CHROMEDRIVER_EXE = 'C:\\Users\\mathew\\windows-share\\dev\\wikicrawl\\chromedriver.exe' + +# Application Parameters +do_breakpoints = boolean(default=False) +page_delay = integer(min=0, max=2000, default=0) + +# Web Driver Parameters +webdriver_user_agent = string(min=0, max=255, default='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)') + +# Requested browser and webdriver dependencies are required for this to work. 
+# This means you need to have installed on your system: +# Chrome + WebDriver for Chrome +# Firefox + geckodriver for Firefox +# phantomjs for phantom +# Options are 'chrome', 'firefox', 'phantom' +webdriver_browser = string(min=1, max=63, default='chrome') + +# Wikipedia Parameters +page_base_url = string(min=1, max=255, default='https://www.wikipedia.org/') + +# Supported Languages so far: +# German, English, Spanish, French, Italian, Portuguese, Polish, Russian +# 'de', 'en', 'es', 'fr', 'it', 'pl', 'pt', 'ru' +page_language = string(min=2, max=22, default='en') + +# API Keys +# Yandex is a web REST API for translating between different languages. +yandex_api_key = string(min=0, max=1023, default='trnsl.1.1.20170825T194642Z.26862b9dd4c1a755.9490ed28de448ff67522c2854f262eff05ec0dc3') + +[logging] +log_file = string(max=255, default='') +log_level = option('critical', 'error', 'warning', 'info', 'debug', default='info') +log_fmt = string(max=255, default='') + + [[formatters]] + [[[colored]]] + () = string(default='colorlog.ColoredFormatter') + format = string(max=255, default='%(log_color)s%(levelname)-8s%(reset)s:%(log_color)s%(name)-5s%(reset)s:%(white)s%(message)s') + + [[[basic]]] + () = string(max=255, default='logging.Formatter') + format = string(max=255, default='%(levelname)s:%(name)s:%(asctime)s:%(message)s') + + [[[forstorage]]] + () = string(max=255, default='logging.Formatter') + format = string(max=255, default='%(levelname)s:%(name)s:%(asctime)s:%(message)s') + + [[handlers]] + [[[stderr]]] + class = string(max=255, default='logging.StreamHandler') + level = option('critical', 'error', 'warning', 'info', 'debug', default='debug') + formatter = string(max=255, default='colored') + + [[[file]]] + class = string(max=255, default='logging.handlers.RotatingFileHandler') + level = option('critical', 'error', 'warning', 'info', 'debug', default='warning') + formatter = string(max=255, default='forstorage') + filename = string(max=255, 
default='road2philosophy.log') + maxBytes = integer(min=0, max=33554432, default=33554432) + backupCount = integer(min=0, max=3, default=1) + + [[loggers]] + [[[root]]] + level = option('critical', 'error', 'warning', 'info', 'debug', default='debug') + handlers = string_list(max=8, default=list('file',)) + + [[[r2p]]] + level = option('critical', 'error', 'warning', 'info', 'debug', default='debug') + handlers = string_list(max=8, default=list('stderr',)) + propagate = boolean(default=False) + diff --git a/wikicrawl/dal.py b/wikicrawl/dal.py index bf29ac0..ac81c4a 100644 --- a/wikicrawl/dal.py +++ b/wikicrawl/dal.py @@ -4,9 +4,7 @@ import sqlite3 import os -from . import config - class DataLayer: - def __init__(self): - pass + def __init__(self, ctx): + self.ctx = ctx diff --git a/wikicrawl/log.py b/wikicrawl/log.py deleted file mode 100644 index 4a14a36..0000000 --- a/wikicrawl/log.py +++ /dev/null @@ -1,39 +0,0 @@ -# log module is a wrapper around third-party colorlog library -# and provides an application-level interface to a logging system. - -import colorlog -import logging - -from . import config - -# Default python log severity levels: -# CRITICAL -# ERROR -# WARNING -# INFO -# DEBUG - -LOGGER = None - -class LoggingLayer: - def __init__(self, config): - self.loggers = {} - logging.config.dictConfig(config) - - # Note on __getitem__: - # __getitem__ overrides the functionality of the [] operator. - # That means this code: - # objinstance = LoggingLayer(...) - # objinstance[foo] calls LoggingLayer.__getitem__(foo) - # and returns the result.
- def __getitem__(self, k): - logger = self.loggers.get(k) - if not logger: - logger = logging.getLogger(k) - self.loggers[k] = logger - return logger - -def init_logging(): - global LOGGER - LOGGER = LoggingLayer(config.obj.LOG_SETTINGS) - diff --git a/wikicrawl/main.py b/wikicrawl/main.py deleted file mode 100644 index 29cf839..0000000 --- a/wikicrawl/main.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python - -import sys - -from . import cli -from . import config -from . import util - -def main(): - user_interface = cli.InteractiveInterface() - - if len(sys.argv) > 1: # Command line arguments were passed in - # command-line when invoking python - user_interface.run_command(sys.argv) - else: - user_interface.start_command_loop() - -def verify_config_is_valid(): - pass - -if __name__ == '__main__': - main() - diff --git a/wikicrawl/model.py b/wikicrawl/model.py index 5be3f54..33455f3 100644 --- a/wikicrawl/model.py +++ b/wikicrawl/model.py @@ -9,15 +9,15 @@ import os import time -from . import browser -from . import config -from . import dal -from . import log from . import pages from . import util class Model: - def __init__(self): + def __init__(self, ctx, browser, dal): + self.ctx = ctx + self.browser = browser + self.dal = dal + self._webdriver = None self._translated_philosophy = None @@ -33,7 +33,7 @@ class Model: # always existing with or without knowing if it exists because if it # hasn't been created yet then it will be created on-the-fly. if not self._webdriver: - self._webdriver = browser.create_webdriver(config.obj.WEBDRIVER_BROWSER) + self._webdriver = self.browser.create_webdriver(self.ctx.config['app']['webdriver_browser']) page_api = pages.LandingPage(self.webdriver) page_api.goto_landing_page() return self._webdriver @@ -41,10 +41,10 @@ class Model: @property def translated_philosophy(self): # This translates 'philosophy' to the target language with only 1 api call. 
- if config.obj.PAGE_LANGUAGE == 'en': + if self.ctx.config['app']['page_language'] == 'en': self._translated_philosophy = 'philosophy' elif not self._translated_philosophy: - text = util.translate_text('en', config.obj.PAGE_LANGUAGE, 'philosophy') + text = util.translate_text('en', self.ctx.config['app']['page_language'], 'philosophy') self._translated_philosophy = text return self._translated_philosophy @@ -59,6 +59,19 @@ class Model: # creates it and then it is re-used later # in the application. + def play_multiple(self, n): + """ + Plays the wikicrawl game -times. + """ + try: + n = int(n) + except ValueError as ex: + self.ctx.log['cli'].warn('failed to process "%s" as a parameter' % n) + return False + for i in range(n): + self.play_random_page() + + def play_random_page(self): """ Select a random page and repeatedly click the first link until @@ -86,7 +99,7 @@ class Model: # In this case, we have made the language a parameter # that you can pass into the program, i.e. you can run it # for English or Spanish or Russian or what have you. - page_api.select_language(config.obj.PAGE_LANGUAGE) + page_api.select_language(self.ctx.config['app']['page_language']) # Main page: next 2 lines @@ -114,20 +127,20 @@ class Model: # Get the article title (and translate if necessary) title = page_api.get_title() - if config.obj.PAGE_LANGUAGE != 'en': - translated_title = util.translate_text(config.obj.PAGE_LANGUAGE, 'en', title) - log.LOGGER['model'].info('visited page: %s (%s)' % (title, translated_title)) + if self.ctx.config['app']['page_language'] != 'en': + translated_title = util.translate_text(self.ctx.config['app']['page_language'], 'en', title) + self.ctx.log['model'].info('visited page: %s (%s)' % (title, translated_title)) else: - log.LOGGER['model'].info('visited page: %s' % title) + self.ctx.log['model'].info('visited page: %s' % title) # Check for page loops (have we already visisted this page?)
if title in pages_visited: - log.LOGGER['model'].info('encountered loop at page = %s' % title) + self.ctx.log['model'].info('encountered loop at page = %s' % title) break # Check if we reached the article on philosophy if self._is_article_on_philosophy(title, translated_title): - log.LOGGER['model'].info('made it to philosophy in %s pages' % len(pages_visited)) + self.ctx.log['model'].info('made it to philosophy in %s pages' % len(pages_visited)) pages_visited.append(title) break @@ -136,7 +149,7 @@ class Model: rc = page_api.click_first_link() if not rc: - log.LOGGER['model'].warn('failure: unable to continue (perhaps no valid links?)') + self.ctx.log['model'].warn('failure: unable to continue (perhaps no valid links?)') break print() diff --git a/wikicrawl/pages.py b/wikicrawl/pages.py index 865cdf3..c3cdb3f 100644 --- a/wikicrawl/pages.py +++ b/wikicrawl/pages.py @@ -2,15 +2,15 @@ # There are separate classes defined for each page with their own # defined methods for performing certain actions. +from . import browser +from . import util +from .assets.languages import LANGUAGES + import re import selenium import time -from . import browser -from . import config -from . import log -from . import util -from .assets.languages import LANGUAGES +PAGE_BASE_URL = 'https://www.wikipedia.org/' class PageRootObject: """ @@ -43,7 +43,8 @@ class PageRootObject: html link in the webpage. 
""" self.highlight(el, 'red') - time.sleep(config.obj.PAGE_DELAY) + DELAY=0 + time.sleep(DELAY) util.breakpoint() el.click() @@ -97,7 +98,7 @@ class LandingPage(PageRootObject): """ Navigates the browser to www.wikipedia.org """ - self.driver.get(config.obj.PAGE_BASE_URL) + self.driver.get(PAGE_BASE_URL) def select_language(self, language): lang_text = LANGUAGES.get(language) @@ -219,13 +220,13 @@ class ArticlePage(PageRootObject): if len(links) == 0: return False for link in links: - log.LOGGER['pages'].debug('processing link: %s' % link.text) + self.ctx.log['pages'].debug('processing link: %s' % link.text) if not self._is_valid_link(p, link): - log.LOGGER['pages'].debug('skipping link inside parenthesis: %s' % link.text) + self.ctx.log['pages'].debug('skipping link inside parenthesis: %s' % link.text) self.highlight(link, 'blue') continue self.highlight(link, 'red') - log.LOGGER['pages'].info('selected link: %s' % link.text) + self.ctx.log['pages'].info('selected link: %s' % link.text) self.click(link) return True diff --git a/wikicrawl/util.py b/wikicrawl/util.py index 4a87ff6..ceadaae 100644 --- a/wikicrawl/util.py +++ b/wikicrawl/util.py @@ -3,14 +3,14 @@ import yandex_translate -from . import config +DO_BREAKPOINTS = False def breakpoint(): """ If DO_BREAKPOINTS is switched on, this will pause program execution and wait for the user to press enter to continue. """ - if config.obj.DO_BREAKPOINTS: + if DO_BREAKPOINTS: input('BREAKPOINT hit. to continue...') def translate_text(source_language, target_language, text):