wikicrawl/app/pages.py

157 lines
4.7 KiB
Python
Raw Normal View History

2017-08-17 01:27:05 -06:00
import logging
import re
import selenium
import selenium.webdriver
# Module-wide settings object; empty until init() installs the real one.
settings = {}


def init(settings_obj: object) -> None:
    """Install ``settings_obj`` as this module's settings.

    Args:
        settings_obj: Object that replaces the module-level ``settings``.
            Other code in this module reads attributes from it (for example
            ``DO_BREAKPOINTS`` and ``PAGE_BASE_URL``).
    """
    global settings
    settings = settings_obj
def breakpoint():
    """Pause for the operator when breakpoints are enabled in the settings.

    Blocks on ``input()`` until <Enter> is pressed when the module-level
    ``settings`` object has a truthy ``DO_BREAKPOINTS`` attribute; otherwise
    returns immediately.

    NOTE: this function shadows the ``breakpoint`` builtin inside this module.
    """
    # getattr with a default: the module's initial ``settings`` value is a
    # plain dict with no DO_BREAKPOINTS attribute, so a direct attribute read
    # would raise AttributeError if this ran before init().
    if getattr(settings, 'DO_BREAKPOINTS', False):
        input('Breakpoint here. <Enter> to continue...')
class PageRootObject:
    """Base page object: owns the WebDriver and offers shared element helpers."""

    # Background colour applied by highlight() for each supported colour name.
    _HIGHLIGHT_BACKGROUNDS = {
        'red': '#ff9292',
        'blue': '#9292ff',
    }

    # Script run against the element; %s is filled with the background colour.
    _HIGHLIGHT_JS = (
        'arguments[0].setAttribute("style", '
        '"background: %s;font-weight:bold;color:#000")'
    )

    def __init__(self, driver=None):
        """Store ``driver``, or create one via ``create_webdriver()`` if falsy.

        Args:
            driver: An existing WebDriver to reuse. When omitted (or falsy) a
                new one is obtained from ``create_webdriver()``.
        """
        self.driver = driver if driver else create_webdriver()

    def click(self, el):
        """Highlight ``el`` in red, then click it."""
        self.highlight(el, 'red')
        el.click()

    def highlight(self, el, color):
        """Overwrite ``el``'s inline style to mark it visually in the browser.

        Args:
            el: The element to restyle.
            color: ``'red'`` or ``'blue'``.

        Raises:
            ValueError: If ``color`` is not a supported colour name.
        """
        try:
            background = self._HIGHLIGHT_BACKGROUNDS[color]
        except KeyError:
            # Previously an unknown colour surfaced as an UnboundLocalError.
            supported = ', '.join(sorted(self._HIGHLIGHT_BACKGROUNDS))
            raise ValueError(
                'unsupported highlight color %r (expected one of: %s)'
                % (color, supported)
            ) from None
        self.driver.execute_script(self._HIGHLIGHT_JS % background, el)
class LandingPage(PageRootObject):
    """Page object for the landing page, where a language is chosen."""

    def __init__(self, driver=None):
        """Forward ``driver`` to the base page object."""
        super().__init__(driver=driver)

    def goto_landing_page(self):
        """Load the URL stored in ``settings.PAGE_BASE_URL``."""
        base_url = settings.PAGE_BASE_URL
        self.driver.get(base_url)

    def select_language(self, language):
        """Click the first link whose text contains ``language``."""
        self.click(self.driver.find_element_by_partial_link_text(language))
class MainPage(PageRootObject):
    """Page object for the wiki main page."""

    def __init__(self, driver=None):
        """Forward ``driver`` to the base page object."""
        super().__init__(driver=driver)

    def goto_random_article(self):
        """Click the first link whose text contains 'Random article'."""
        self.click(self.driver.find_element_by_partial_link_text('Random article'))
class ArticlePage(PageRootObject):
    """Page object for an article page.

    Finds the first link in the article's paragraphs that is not inside
    parentheses and is not a footnote, IPA-help or audio link, and clicks it.
    """

    # DOM ids of the elements this page object looks up.
    elements = {
        'main-window-content-text-id': 'mw-content-text',
        'article-title': 'firstHeading',
    }

    def __init__(self, driver=None):
        """Forward ``driver`` to the base page object."""
        super().__init__(driver)

    def get_title(self):
        """Return the text of the article's heading element."""
        heading = self.driver.find_element_by_id(ArticlePage.elements['article-title'])
        return heading.text

    def click_first_link(self):
        """Click the first valid link in the article body.

        Returns:
            True if a link was clicked, False if no paragraph had a valid one.
        """
        return self._iterate_paragraphs()

    def _iterate_paragraphs(self):
        """Try each body paragraph in order until one yields a clicked link.

        Returns:
            True as soon as a paragraph's link was clicked, otherwise False.
        """
        main_window = self.driver.find_element_by_id(ArticlePage.elements['main-window-content-text-id'])
        # Paragraphs that hold a "coordinates" span are excluded by the XPath.
        paragraphs = main_window.find_elements_by_xpath('./div[contains(@class, "mw-parser-output")]/p[not(span/span[contains(@id, "coordinates")])]')
        # any() stops at the first paragraph that reports a click.
        return any(self._parse_paragraph(p) for p in paragraphs)

    def _parse_paragraph(self, p):
        """Click the first valid link inside paragraph ``p``.

        Rejected links are highlighted blue; the selected link is highlighted
        red before it is clicked.

        Returns:
            True if a link was clicked, False if ``p`` has no valid link.
        """
        for link in p.find_elements_by_xpath('.//a'):
            logging.debug('processing link: %s', link.text)
            if not self._is_valid_link(p, link):
                # A link can be rejected for several reasons, not only for
                # being parenthesised, so the message stays generic.
                logging.debug('skipping invalid link: %s', link.text)
                self.highlight(link, 'blue')
                continue
            self.highlight(link, 'red')
            logging.info('selected link: %s', link.text)
            # Module-level breakpoint() helper (shadows the builtin): lets an
            # operator inspect the highlighted page before navigation.
            breakpoint()
            link.click()
            return True
        return False

    def _is_valid_link(self, p, el):
        """Return True if link ``el`` inside paragraph ``p`` may be followed.

        A link is valid when it is not inside parentheses and is neither a
        footnote, an IPA pronunciation-help link, nor an audio link.
        """
        in_parenthesis = self._is_link_in_parenthesis(p, el)
        is_footnote = self._is_link_a_footnote(el)
        is_pronunciation = self._is_link_pronounciation(el)
        is_audio = self._is_link_audio(el)
        logging.debug(
            'link checks: in_parenthesis=%s footnote=%s pronunciation=%s audio=%s',
            in_parenthesis, is_footnote, is_pronunciation, is_audio,
        )
        return not (in_parenthesis or is_footnote or is_pronunciation or is_audio)

    @staticmethod
    def _parenthesized_spans(html):
        """Return ``(start, end)`` index pairs of matched ``(...)`` groups.

        Only parentheses in text are considered: anything between ``<`` and
        ``>`` is treated as tag markup and skipped, so parentheses inside
        attributes such as ``href="/wiki/X_(y)"`` do not open or close a
        group. Nested groups each yield their own pair; an unmatched ``(`` or
        ``)`` yields nothing. ``end`` is one past the closing parenthesis.
        """
        spans = []
        open_positions = []
        inside_tag = False
        for idx, char in enumerate(html):
            if inside_tag:
                inside_tag = char != '>'
            elif char == '<':
                inside_tag = True
            elif char == '(':
                open_positions.append(idx)
            elif char == ')' and open_positions:
                spans.append((open_positions.pop(), idx + 1))
        return spans

    def _is_link_in_parenthesis(self, p, el):
        """Return True if link ``el`` sits inside parentheses in paragraph ``p``.

        The link's ``outerHTML`` is located within the paragraph's
        ``innerHTML``; the link counts as parenthesised when any occurrence
        lies strictly inside a matched ``(...)`` group of the paragraph text.
        """
        link_html = el.get_attribute('outerHTML')
        p_html = p.get_attribute('innerHTML')
        if not link_html or not p_html:
            return False
        spans = self._parenthesized_spans(p_html)
        if not spans:
            return False
        start = p_html.find(link_html)
        while start != -1:
            end = start + len(link_html)
            if any(lo < start and end < hi for lo, hi in spans):
                return True
            # The same markup may occur again later in the paragraph.
            start = p_html.find(link_html, start + 1)
        return False

    def _is_link_a_footnote(self, el):
        """Return True if ``el``'s href points at a citation or note anchor."""
        # An anchor without an href yields None; treat it as an empty string.
        href = el.get_attribute('href') or ''
        return '#cite_note' in href or '#cnote' in href

    def _is_link_pronounciation(self, el):
        """Return True if ``el``'s href points at an IPA help page."""
        href = el.get_attribute('href') or ''
        return '/wiki/Help:IPA' in href

    def _is_link_audio(self, el):
        """Return True if ``el``'s href contains '.ogg'."""
        href = el.get_attribute('href') or ''
        return '.ogg' in href