# Website
#
# Douglas Thrift
#
# $Id$
7 |
from BeautifulSoup import MinimalSoup |
8 |
import copy |
9 |
import mechanize |
10 |
import re |
11 |
|
12 |
# XXX: hack to make Wells Fargo http-equiv redirects actually work.
# 'body' is appended to the head parser's element list (presumably so the
# parser keeps going when an http-equiv <meta> sits inside <body> -- confirm
# against mechanize's AbstractHeadParser), and the class-wide redirect limit
# is set to 20.
mechanize._http.AbstractHeadParser.head_elems = tuple(mechanize._http.AbstractHeadParser.head_elems) + ('body',)
mechanize._http.HTTPRedirectHandler.max_redirections = 20
15 |
|
16 |
class DuckSoup(MinimalSoup):
    """MinimalSoup subclass carrying one additional markup-massage rule.

    MARKUP_MASSAGE starts as a shallow copy of MinimalSoup's list, so the
    parent class's list is left untouched, and gains one
    (compiled pattern, replacement callable) pair.
    """

    MARKUP_MASSAGE = copy.copy(MinimalSoup.MARKUP_MASSAGE)

    # The pattern matches "<" or "</", lowercase letters, a quote, a literal
    # "+", another quote and more lowercase letters (e.g. <scr'+'ipt); the
    # replacement callable returns the empty string for every match.
    # NOTE(review): presumably this strips tag names that scripts build by
    # string concatenation -- confirm against the pages being parsed.
    MARKUP_MASSAGE += [
        (re.compile(r'''</?[a-z]+['"]\+['"][a-z]+'''), lambda _found: ''),
    ]
20 |
|
21 |
class Website(object):
    """Holds a mechanize browser configured in __init__."""

    # Parser class associated with this website (presumably used to parse
    # fetched pages -- no use is visible in this part of the file).
    Soup = DuckSoup

    def __init__(self, debug):
        """Create the mechanize browser and apply its settings.

        debug -- when true, HTTP debug output is switched on for the browser.
        """
        user_agent = (
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) '
            'AppleWebKit/525.19 (KHTML, like Gecko) '
            'Chrome/0.4.154.25 Safari/525.19'
        )

        # Bind the attribute first so it exists before any configuration call.
        browser = self.browser = mechanize.Browser()
        browser.addheaders = [('User-agent', user_agent)]

        if debug:
            browser.set_debug_http(True)

        # Turn on http-equiv, redirect and refresh handling, in that order.
        for enable in (
            browser.set_handle_equiv,
            browser.set_handle_redirect,
            browser.set_handle_refresh,
        ):
            enable(True)

        # Robots handling is explicitly switched off.
        browser.set_handle_robots(False)