From 82315472f66794f5f9c21537f94ec27e3bb2b403 Mon Sep 17 00:00:00 2001 From: Yoshiko2 <42309414+yoshiko2@users.noreply.github.com> Date: Fri, 17 Jul 2020 15:06:42 +0800 Subject: [PATCH] Update 3.5.1 --- config.py | 6 +++++- javdb.py | 19 ++++++++++++++----- number_parser.py | 2 +- test.py | 5 +++++ 4 files changed, 25 insertions(+), 7 deletions(-) create mode 100644 test.py diff --git a/config.py b/config.py index bdbebe0..b912477 100644 --- a/config.py +++ b/config.py @@ -1,12 +1,16 @@ import os import configparser +import codecs class Config: def __init__(self, path: str = "config.ini"): if os.path.exists(path): self.conf = configparser.ConfigParser() - self.conf.read(path, encoding="utf-8") + try: + self.conf.read(path, encoding="utf-8-sig") + except: + self.conf.read(path, encoding="utf-8") else: print("[-] Config file not found! Use the default settings") self.conf = self._default_config() diff --git a/javdb.py b/javdb.py index ce19601..6a0ccbd 100755 --- a/javdb.py +++ b/javdb.py @@ -68,13 +68,22 @@ def getCover_small(a, index=0): # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] - if not 'https' in result: - result = 'https:' + result - return result + try: + result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] + if not 'https' in result: + result = 'https:' + result + return result + except: # 2020.7.17 Repair Cover Url crawl + result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index] + if not 'https' in result: + result = 'https:' + result + return result def getCover(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")).strip(" ['']") + try: + result = 
html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0] + except: # 2020.7.17 Repair Cover Url crawl + result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0] return result def getDirector(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() diff --git a/number_parser.py b/number_parser.py index bbbbe0d..d9e35f8 100644 --- a/number_parser.py +++ b/number_parser.py @@ -45,4 +45,4 @@ def get_number(filepath: str) -> str: if __name__ == "__main__": import doctest - doctest.testmod(raise_on_error=True) + doctest.testmod(raise_on_error=True) \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..930541c --- /dev/null +++ b/test.py @@ -0,0 +1,5 @@ +## 2020.6.22 更新 +* 改进:网站爬虫子程序参数混乱 +* 修复:命名规则release参数带```/```的问题 +* 新增:socks5本地代理连接 +* 新增:命名规则series参数 \ No newline at end of file