1 |
douglas |
1131 |
# Website |
2 |
|
|
# |
3 |
|
|
# Douglas Thrift |
4 |
|
|
# |
5 |
|
|
# $Id$ |
6 |
|
|
|
7 |
douglas |
1142 |
from BeautifulSoup import MinimalSoup |
8 |
|
|
import copy |
9 |
douglas |
1131 |
import mechanize |
10 |
douglas |
1142 |
import re |
11 |
douglas |
1131 |
|
12 |
|
|
# XXX: hack to make Wells Fargo http-equiv redirects actually work
# Adds 'body' to the element names listed in AbstractHeadParser.head_elems
# (presumably so http-equiv tags that appear inside <body> are still seen;
# confirm against mechanize's head parser).
mechanize._http.AbstractHeadParser.head_elems = (
    tuple(mechanize._http.AbstractHeadParser.head_elems) + ('body',)
)
14 |
|
|
|
15 |
douglas |
1142 |
class DuckSoup(MinimalSoup):
    """MinimalSoup with one extra markup-massage rule.

    The added rule deletes tag fragments whose name is split by a quoted
    string concatenation, e.g. ``</scr'+'ipt`` (presumably emitted by inline
    JavaScript), before the markup is parsed.
    """

    # Build a fresh list so the parent's MARKUP_MASSAGE is left untouched.
    MARKUP_MASSAGE = list(MinimalSoup.MARKUP_MASSAGE) + [
        (re.compile(r'''</?[a-z]+['"]\+['"][a-z]+'''), lambda _match: ''),
    ]
19 |
|
|
|
20 |
douglas |
1131 |
class Website(object):
    """Holds a pre-configured ``mechanize.Browser`` in ``self.browser``."""

    # Parser class associated with this site (how it is used is not shown here).
    Soup = DuckSoup

    def __init__(self, debug):
        """Create and configure the browser.

        debug -- when truthy, HTTP debugging is switched on for the browser.
        """
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.25 Safari/525.19'

        browser = mechanize.Browser()
        self.browser = browser

        # Replace the default headers with a single User-agent header.
        browser.addheaders = [('User-agent', user_agent)]

        if debug:
            browser.set_debug_http(True)

        # Enable http-equiv, redirect and refresh handling; disable robots handling.
        browser.set_handle_equiv(True)
        browser.set_handle_redirect(True)
        browser.set_handle_refresh(True)
        browser.set_handle_robots(False)