对GETCHU使用指定编码"EUC-JP"(为GETCHU响应Header中指定的编码)
This commit is contained in:
@@ -3,6 +3,8 @@
|
||||
import re
|
||||
import json
|
||||
from urllib.parse import quote
|
||||
|
||||
from scrapinglib import httprequest
|
||||
from .parser import Parser
|
||||
|
||||
|
||||
@@ -64,6 +66,18 @@ class wwwGetchu(Parser):
|
||||
return None
|
||||
return detailurl.replace('../', 'http://www.getchu.com/')
|
||||
|
||||
def getHtml(self, url, type = None):
    """Fetch ``url`` through ``httprequest.get`` with EUC-JP as the encoding.

    The request is made with this instance's cookies, proxies, extra
    headers and verify setting, and ``encoding='euc-jp'`` is passed
    explicitly (per the commit note, the encoding GETCHU declares in its
    response header).

    Args:
        url: Address of the page to request.
        type: Forwarded unchanged to ``httprequest.get`` as ``return_type``.

    Returns:
        The integer ``404`` when any known "not found" marker occurs in the
        response; otherwise the response exactly as ``httprequest.get``
        returned it.
    """
    page = httprequest.get(
        url,
        cookies=self.cookies,
        proxies=self.proxies,
        extra_headers=self.extraheader,
        encoding='euc-jp',
        verify=self.verify,
        return_type=type,
    )
    # Markers are checked in this order; the first hit ends the scan.
    not_found_markers = (
        '<title>404 Page Not Found',
        '<title>未找到页面',
        '404 Not Found',
        '<title>404',
        '<title>お探しの商品が見つかりません',
    )
    if any(marker in page for marker in not_found_markers):
        return 404
    return page
|
||||
|
||||
def getNum(self, htmltree):
    """Build the ``GETCHU-<id>`` number from ``self.detailurl``.

    The detail-page URL prefix is removed from ``self.detailurl`` and the
    first run of digits in what remains is used as the id.

    Args:
        htmltree: Not used by this method; the number comes from the URL.

    Returns:
        The string ``'GETCHU-'`` followed by the first digit run.

    Raises:
        IndexError: If no digits remain in the URL after the prefix is
            removed.
    """
    remainder = self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", "")
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # emits a warning on current Python versions.
    return 'GETCHU-' + re.findall(r'\d+', remainder)[0]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user