wikicrawl/app/model.py

65 lines
1.6 KiB
Python

import logging
import os
import time
from . import browser
from . import log
from . import dal
from . import pages
# Placeholder until init() installs the real settings object.
settings = {}


def init(settings_obj):
    """Install settings_obj as this module's settings and propagate it.

    The same object is handed, in order, to the init() function of the
    browser, dal, pages and log modules.
    """
    global settings
    settings = settings_obj
    # Order matters only insofar as it mirrors the original call sequence.
    for component in (browser, dal, pages, log):
        component.init(settings_obj)
class Model:
    """Walks from a random article to article by clicking links.

    A single webdriver is created on demand through ``browser`` and reused
    for every page object built from the ``pages`` module.
    """

    def __init__(self):
        # Populated lazily by the ``webdriver`` property on first access.
        self._webdriver = None

    @property
    def webdriver(self):
        """Return the cached webdriver, creating it on first access."""
        # Compare against None explicitly so that a driver object which
        # happens to be falsy is not recreated on every access.
        if self._webdriver is None:
            self._webdriver = browser.create_webdriver()
        return self._webdriver

    def do_random_page(self):
        """Open a random article and keep calling click_first_link().

        The walk stops when an article title repeats (a loop), when the
        title is exactly 'Philosophy', or when ``click_first_link()``
        returns a falsy value. Progress is reported through the ``logging``
        module; the method returns None.
        """
        # Landing page (select language)
        page_api = pages.LandingPage(self.webdriver)
        page_api.goto_landing_page()
        page_api.select_language(settings.PAGE_LANGUAGE)

        # Main page
        page_api = pages.MainPage(self.webdriver)
        page_api.goto_random_article()

        # Article page
        pages_visited = []
        while True:
            page_api = pages.ArticlePage(self.webdriver)
            title = page_api.get_title()
            logging.debug('visited page: %s', title)

            if title in pages_visited:
                logging.info('encountered loop at page = %s', title)
                break

            if title == 'Philosophy':
                # The count is taken before this title is recorded, so it
                # excludes the 'Philosophy' page itself.
                logging.info('made it to philosophy in %s pages',
                             len(pages_visited))
                pages_visited.append(title)
                break

            pages_visited.append(title)
            rc = page_api.click_first_link()
            if not rc:
                # logging.warn is a deprecated alias; use logging.warning.
                logging.warning(
                    'failure: unable to continue (perhaps no valid links?)')
                break

        # Emit a blank line on stdout once the walk has ended.
        print()