157 lines
4.7 KiB
Python
157 lines
4.7 KiB
Python
|
import logging
|
||
|
import re
|
||
|
import selenium
|
||
|
import selenium.webdriver
|
||
|
|
||
|
# Module-wide settings object shared by the helpers and page objects below.
# It starts out as an empty placeholder dict and is rebound by init().
# The code in this module reads the attributes DO_BREAKPOINTS and
# PAGE_BASE_URL from it; the placeholder dict has neither, so init() should
# be called with a real settings object before the page objects are used.
settings = {}
|
||
|
|
||
|
def init(settings_obj):
    """Install ``settings_obj`` as the module-wide ``settings`` object.

    Args:
        settings_obj: Object to store under the module-level name
            ``settings``; the other code in this module reads its
            configuration from that name.

    Returns:
        None.
    """
    # Rebind the module-level name through the module namespace so every
    # function defined in this module sees the new object.
    globals()['settings'] = settings_obj
|
||
|
|
||
|
def breakpoint():
    """Pause for the operator when breakpoints are enabled in the settings.

    Reads ``DO_BREAKPOINTS`` from the module-level ``settings`` object and,
    when it is truthy, blocks on ``input()`` until <Enter> is pressed.

    A missing flag is treated as "breakpoints disabled" instead of raising
    ``AttributeError``; this covers the empty placeholder dict that
    ``settings`` holds until ``init()`` has been called.

    NOTE: inside this module the name shadows the ``breakpoint()`` builtin.
    It is kept unchanged because other code in this module calls it.

    Returns:
        None.
    """
    if getattr(settings, 'DO_BREAKPOINTS', False):
        input('Breakpoint here. <Enter> to continue...')
|
||
|
|
||
|
class PageRootObject:
    """Base class for page objects; holds the WebDriver and shared helpers.

    Subclasses use ``self.driver`` to talk to the browser and ``click()`` /
    ``highlight()`` to interact with, and visually mark, page elements.
    """

    # JavaScript run against an element to restyle it; ``%s`` receives the
    # background colour.
    _HIGHLIGHT_JS = (
        'arguments[0].setAttribute("style", '
        '"background: %s;font-weight:bold;color:#000")'
    )

    # Supported highlight names and the background colour each one maps to.
    _HIGHLIGHT_COLORS = {
        'red': '#ff9292',
        'blue': '#9292ff',
    }

    def __init__(self, driver=None):
        """Store the WebDriver to use, creating one when none is given.

        Args:
            driver: WebDriver instance to reuse. When omitted (or falsy) a
                new one is obtained from ``create_webdriver()``.
        """
        # NOTE(review): create_webdriver is not defined in the visible part
        # of this module - confirm it is provided before relying on the
        # default.
        self.driver = driver or create_webdriver()

    def click(self, el):
        """Highlight ``el`` in red and then click it.

        Args:
            el: Element to click; must offer a ``click()`` method.
        """
        self.highlight(el, 'red')
        el.click()

    def highlight(self, el, color):
        """Restyle ``el`` in the browser so it stands out on the page.

        Args:
            el: Element handed to the driver's ``execute_script`` as the
                script argument.
            color: Name of the highlight colour, ``'red'`` or ``'blue'``.

        Raises:
            ValueError: If ``color`` is not a supported name. Previously
                this case failed with an ``UnboundLocalError``.
        """
        try:
            background = self._HIGHLIGHT_COLORS[color]
        except KeyError:
            raise ValueError(
                'unsupported highlight color: %r (expected one of: %s)'
                % (color, ', '.join(sorted(self._HIGHLIGHT_COLORS)))
            ) from None
        self.driver.execute_script(self._HIGHLIGHT_JS % background, el)
|
||
|
|
||
|
class LandingPage(PageRootObject):
    """Page object for the landing page found at ``settings.PAGE_BASE_URL``.

    The constructor is inherited unchanged from ``PageRootObject``.
    """

    def goto_landing_page(self):
        """Navigate the browser to ``settings.PAGE_BASE_URL``."""
        self.driver.get(settings.PAGE_BASE_URL)

    def select_language(self, language):
        """Click the first link whose text contains ``language``.

        Args:
            language: Text (or part of the text) of the language link.
        """
        # 'partial link text' is the value of Selenium's
        # By.PARTIAL_LINK_TEXT. The generic find_element(by, value) call is
        # used because the find_element_by_* helpers were removed in
        # Selenium 4.3, while find_element exists in every version.
        link = self.driver.find_element('partial link text', language)
        self.click(link)
|
||
|
|
||
|
class MainPage(PageRootObject):
    """Page object for the main page.

    The constructor is inherited unchanged from ``PageRootObject``.
    """

    def goto_random_article(self):
        """Click the link whose text contains ``Random article``."""
        # 'partial link text' is the value of Selenium's
        # By.PARTIAL_LINK_TEXT. The generic find_element(by, value) call is
        # used because the find_element_by_* helpers were removed in
        # Selenium 4.3, while find_element exists in every version.
        link = self.driver.find_element('partial link text', 'Random article')
        self.click(link)
|
||
|
|
||
|
class ArticlePage(PageRootObject):
    """Page object for an article page.

    Its main job is ``click_first_link()``: walk the article's paragraphs and
    click the first link that is not inside parentheses, not a footnote
    marker, not a pronunciation-help link and not an audio file.

    The constructor is inherited unchanged from ``PageRootObject``.
    """

    # Element ids used to locate parts of the article page.
    elements = {
        'main-window-content-text-id': 'mw-content-text',
        'article-title': 'firstHeading',
    }

    # Paragraphs of the parser output, excluding a paragraph that wraps an
    # element whose id contains "coordinates".
    _PARAGRAPHS_XPATH = (
        './div[contains(@class, "mw-parser-output")]'
        '/p[not(span/span[contains(@id, "coordinates")])]'
    )

    # Shortest "(...)" runs on a single line of the paragraph's HTML.
    _PARENTHESIZED = re.compile(r'\(.*?\)')

    # href fragments that identify footnote links.
    _FOOTNOTE_MARKERS = ('#cite_note', '#cnote')

    # Locator strategy names, equal to Selenium's By.ID and By.XPATH. The
    # generic find_element(s)(by, value) calls are used because the
    # find_element(s)_by_* helpers were removed in Selenium 4.3.
    _BY_ID = 'id'
    _BY_XPATH = 'xpath'

    def get_title(self):
        """Return the text of the article's heading element."""
        heading = self.driver.find_element(
            self._BY_ID, ArticlePage.elements['article-title'])
        return heading.text

    def click_first_link(self):
        """Click the first acceptable link in the article body.

        Returns:
            bool: True when a link was clicked, False when none was found.
        """
        return self._iterate_paragraphs()

    def _iterate_paragraphs(self):
        """Try each paragraph in order until one yields a clicked link.

        Returns:
            bool: True as soon as a paragraph produced a click, else False.
        """
        main_window = self.driver.find_element(
            self._BY_ID, ArticlePage.elements['main-window-content-text-id'])
        paragraphs = main_window.find_elements(
            self._BY_XPATH, self._PARAGRAPHS_XPATH)
        # any() stops at the first paragraph that reports a click, so no
        # element is touched after the browser has navigated away.
        return any(self._parse_paragraph(p) for p in paragraphs)

    def _parse_paragraph(self, p):
        """Click the first valid link inside paragraph ``p``.

        Rejected links are highlighted in blue; the selected link is
        highlighted in red and clicked after an optional breakpoint.

        Args:
            p: Paragraph element to scan.

        Returns:
            bool: True when a link was clicked, False when the paragraph has
            no links or none of them is valid.
        """
        for link in p.find_elements(self._BY_XPATH, './/a'):
            logging.debug('processing link: %s', link.text)
            if not self._is_valid_link(p, link):
                # The link may have been rejected for any of the four checks,
                # not only for sitting inside parentheses.
                logging.debug('skipping invalid link: %s', link.text)
                self.highlight(link, 'blue')
                continue
            self.highlight(link, 'red')
            logging.info('selected link: %s', link.text)
            breakpoint()
            link.click()
            return True
        return False

    def _is_valid_link(self, p, el):
        """Return True when ``el`` passes all four exclusion checks.

        Args:
            p: Paragraph element that contains the link.
            el: Link element under test.

        Returns:
            bool: False when the link is inside parentheses, a footnote, a
            pronunciation link or an audio link; True otherwise.
        """
        in_parenthesis = self._is_link_in_parenthesis(p, el)
        footnote = self._is_link_a_footnote(el)
        pronunciation = self._is_link_pronounciation(el)
        audio = self._is_link_audio(el)
        logging.debug(
            'link checks: parenthesis=%s footnote=%s pronunciation=%s audio=%s',
            in_parenthesis, footnote, pronunciation, audio)
        return not (in_parenthesis or footnote or pronunciation or audio)

    def _is_link_in_parenthesis(self, p, el):
        """Return True when the link's HTML lies inside a "(...)" run.

        The check is textual: the link's ``outerHTML`` is searched for inside
        every parenthesised run of the paragraph's ``innerHTML``.

        NOTE: because the match is the shortest "(...)" on the raw HTML, a
        ")" that appears earlier (for example inside an href) ends the run
        early; nested parentheses are therefore not handled.

        Args:
            p: Paragraph element that contains the link.
            el: Link element under test.

        Returns:
            bool: True when the link is found inside a parenthesised run.
        """
        link_html = el.get_attribute('outerHTML')
        paragraph_html = p.get_attribute('innerHTML')
        if not link_html or not paragraph_html:
            return False
        # finditer resumes exactly where the previous match ended, so a
        # group that starts right after another one, e.g. "(a)(b)", is not
        # skipped.
        return any(
            link_html in match.group(0)
            for match in self._PARENTHESIZED.finditer(paragraph_html)
        )

    @staticmethod
    def _href(el):
        """Return the element's ``href`` attribute, or '' when it has none."""
        return el.get_attribute('href') or ''

    def _is_link_a_footnote(self, el):
        """Return True when the link's href points at a footnote anchor."""
        href = self._href(el)
        return any(marker in href for marker in self._FOOTNOTE_MARKERS)

    def _is_link_pronounciation(self, el):
        """Return True when the link's href points at the IPA help pages."""
        return '/wiki/Help:IPA' in self._href(el)

    def _is_link_audio(self, el):
        """Return True when the link's href refers to an ``.ogg`` file."""
        return '.ogg' in self._href(el)
|
||
|
|