1 |
# Website |
2 |
# |
3 |
# Douglas Thrift |
4 |
# |
5 |
# $Id$ |
6 |
|
7 |
from BeautifulSoup import MinimalSoup |
8 |
import copy |
9 |
import mechanize |
10 |
import re |
11 |
|
12 |
# XXX: hack to make Wells Fargo http-equiv redirects actually work
# Append 'body' to the element names mechanize's head parser recognises
# (presumably because the site emits its http-equiv <meta> inside <body> --
# TODO confirm against the live markup).
mechanize._http.AbstractHeadParser.head_elems = (
    tuple(mechanize._http.AbstractHeadParser.head_elems) + ('body',)
)
# Set the redirect handler's limit on chained redirects to 20.
mechanize._http.HTTPRedirectHandler.max_redirections = 20
15 |
|
16 |
class DuckSoup(MinimalSoup):
    """MinimalSoup with one extra markup-massage rule.

    The added rule deletes tag fragments whose name is split by a quoted
    string concatenation, such as ``<scr'+'ipt`` or ``</scr"+"ipt``
    (presumably left over from inline JavaScript that writes tags
    piecewise), before the markup is parsed.
    """

    # Start from a private copy of the parent's rules so the parent's own
    # list is never modified, then add the fragment-stripping rule.
    MARKUP_MASSAGE = list(MinimalSoup.MARKUP_MASSAGE) + [
        (re.compile(r'''</?[a-z]+['"]\+['"][a-z]+'''), lambda match: ''),
    ]
20 |
|
21 |
class Factory(mechanize.DefaultFactory):
    """mechanize.DefaultFactory that builds forms with a caller-chosen parser."""

    def __init__(self, form_parser):
        """Initialise the default factory, then install a custom forms factory.

        form_parser is passed to mechanize.FormsFactory as its
        form_parser_class argument.
        """
        # Explicit base-class call (not super()): the base may be an
        # old-style class under Python 2.
        mechanize.DefaultFactory.__init__(self)

        # Assigned after the base initialiser so this value is the one that
        # remains on the instance.
        forms_factory = mechanize.FormsFactory(form_parser_class=form_parser)
        self._forms_factory = forms_factory
26 |
|
27 |
class Website(object):
    """Base class wrapping a mechanize.Browser with navigation bookkeeping.

    The ``_follow_link`` and ``_submit`` helpers count each successful
    navigation in ``_count`` so that ``_back`` can rewind all of them with
    a single call.
    """

    # Soup class associated with this website; subclasses may override it.
    Soup = DuckSoup

    def __init__(self, debug, *args, **kwargs):
        """Create and configure the underlying browser.

        debug -- when true, turn on the browser's HTTP debug output.
        *args, **kwargs -- forwarded unchanged to mechanize.Browser.
        """
        browser = self.browser = mechanize.Browser(*args, **kwargs)

        # Replace the browser's header list with a single desktop Chrome
        # User-agent header.
        browser.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.31 Safari/530.5')]

        if debug:
            browser.set_debug_http(True)

        # Enable http-equiv, redirect and refresh handling; disable
        # robots.txt handling.
        browser.set_handle_equiv(True)
        browser.set_handle_redirect(True)
        browser.set_handle_refresh(True)
        browser.set_handle_robots(False)

        # Navigations made through _follow_link/_submit since the last _back.
        self._count = 0

    def _back(self):
        """Rewind every navigation counted so far.

        Calls ``browser.back`` with the current count, resets the count to
        zero and returns what ``browser.back`` returned.  Returns None
        without touching the browser when nothing has been counted.
        """
        if not self._count:
            return None

        response = self.browser.back(self._count)
        self._count = 0

        return response

    def _follow_link(self, *args, **kwargs):
        """Follow a link via the browser and count the navigation.

        All arguments are forwarded to ``browser.follow_link``; its result
        is returned.  The count is only incremented if that call returns.
        """
        response = self.browser.follow_link(*args, **kwargs)
        self._count += 1

        return response

    def _submit(self, *args, **kwargs):
        """Submit via the browser and count the navigation.

        All arguments are forwarded to ``browser.submit``; its result is
        returned.  The count is only incremented if that call returns.
        """
        response = self.browser.submit(*args, **kwargs)
        self._count += 1

        return response