diff --git a/settings.py b/settings.py index 767b009..31ee594 100644 --- a/settings.py +++ b/settings.py @@ -91,7 +91,7 @@ class DefaultSettings: 'model': { 'level': logging.DEBUG, 'handlers': ['stderr'], - 'propagate': True + 'propagate': True # Send to root logger }, 'cli': { 'level': logging.DEBUG, diff --git a/wikicrawl/cli.py b/wikicrawl/cli.py index b2890ef..e14a06e 100644 --- a/wikicrawl/cli.py +++ b/wikicrawl/cli.py @@ -6,7 +6,6 @@ import baker import sys - if sys.platform == 'linux': import readline # Needed for command history and arrows to work @@ -56,7 +55,7 @@ class InteractiveInterface: commander.usage() except Exception as ex: log.LOGGER['cli'].error('caught general exception!!') - log.LOGGER['cli'].error(type(ex), ex) + log.LOGGER['cli'].error(ex) def start_command_loop(self): """ @@ -170,6 +169,10 @@ class InteractiveInterface: self.run_command(args, main=False) + @commander.command + def play_specific_page(self, title): + pass + @commander.command def play_random_page(self): """ diff --git a/wikicrawl/dal.py b/wikicrawl/dal.py index 84d7cbe..a6e07f7 100644 --- a/wikicrawl/dal.py +++ b/wikicrawl/dal.py @@ -1,3 +1,6 @@ +# Nothing is implemented here yet... This is intended to be a future +# exercise. + import sqlite3 import pycurl import os diff --git a/wikicrawl/pages.py b/wikicrawl/pages.py index bae4b23..f4bad9b 100644 --- a/wikicrawl/pages.py +++ b/wikicrawl/pages.py @@ -221,6 +221,8 @@ class ArticlePage(PageRootObject): return False if self._is_link_audio(el): return False + if self._is_not_wikipedia(el): + return False return True def _is_link_in_parenthesis(self, p, el): @@ -299,3 +301,9 @@ class ArticlePage(PageRootObject): return True return False + def _is_not_wikipedia(self, el): + href = el.get_attribute('href') + if 'wikipedia.org' not in href: + return True + return False +