WebCrawler:全面换装getInstance()，厘清airav.py与javbus.py及javdb.py的相爱相杀

2021-10-08 11:46:35 +08:00
parent cf072e79d1
commit a405c5c41b
11 changed files with 206 additions and 167 deletions
--- a/WebCrawler/init.py
+++ b/WebCrawler/init.py
@@ -134,6 +134,14 @@ def get_data_from_json(file_number, conf: config.Config):  # 从JSON返回元数
        print('[-]Movie Number not found!')
        return None

+    # 增加number严格判断，避免提交任何number，总是返回"本橋実来 ADZ335"，这种返回number不一致的数据源故障
+    # 目前选用number命名规则是javdb.com Domain Creation Date: 2013-06-19T18:34:27Z
+    # 然而也可以跟进关注其它命名规则例如airav.wiki Domain Creation Date: 2019-08-28T07:18:42.0Z
+    # 如果将来javdb.com命名规则下不同Studio出现同名碰撞导致无法区分，可考虑更换规则，更新相应的number分析和抓取代码。
+    if str(json_data.get('number')).upper() != file_number.upper():
+        print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
+        return None
+
    # ================================================网站规则添加结束================================================

    title = json_data.get('title')
@@ -225,6 +233,8 @@ def get_data_from_json(file_number, conf: config.Config):  # 从JSON返回元数
    studio = studio.replace('エムズビデオグループ','M’s Video Group')
    studio = studio.replace('ミニマム','Minimum')
    studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
+    studio = studio.replace('pacopacomama,パコパコママ','pacopacomama')
+    studio = studio.replace('パコパコママ','pacopacomama')
    studio = re.sub('.*/妄想族','妄想族',studio)
    studio = studio.replace('/',' ')
    # ===  替换Studio片假名 END
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -6,6 +6,7 @@ from lxml import etree#need install
 from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
+from WebCrawler import javbus

 '''
 API
@@ -17,95 +18,94 @@ API
 host = 'https://www.airav.wiki'

 # airav这个网站没有演员图片，所以直接使用javbus的图
-def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find_all(attrs={'class': 'star-name'})
-    d={}
-    for i in a:
-        l=i.a['href']
-        t=i.get_text()
-        html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=urljoin("https://www.javbus.com",
-                  str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
-        p2={t:p}
-        d.update(p2)
-    return d
+def getActorPhoto(javbus_json):
+    result = javbus_json.get('actor_photo')
+    if isinstance(result, dict) and len(result):
+        return result
+    return ''

 def getTitle(htmlcode):  #获取标题
-    doc = pq(htmlcode)
-    # h5:first-child定位第一个h5标签，妈的找了好久才找到这个语法
-    title = str(doc('div.d-flex.videoDataBlock h5.d-none.d-md-block:nth-child(2)').text()).replace(' ', '-')
-    try:
-        title2 = re.sub('n\d+-','',title)
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    title = str(html.xpath('/html/head/title/text()')[0])
+    result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
+    return result

-        return title2
+def getStudio(htmlcode, javbus_json): #获取厂商 已修改
+    # javbus如果有数据以它为准
+    result = javbus_json.get('studio')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode,etree.HTMLParser())
+    return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
+def getYear(htmlcode, javbus_json):   #获取年份
+    result = javbus_json.get('year')
+    if isinstance(result, str) and len(result):
+        return result
+    release = getRelease(htmlcode, javbus_json)
+    if len(release) != len('2000-01-01'):
+        return ''
+    return release[:4]
+def getCover(htmlcode, javbus_json):  #获取封面图片
+    result = javbus_json.get('cover')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
+def getRelease(htmlcode, javbus_json): #获取出版日期
+    result = javbus_json.get('release')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    try:
+        result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
    except:
-        return title
-
-def getStudio(htmlcode): #获取厂商 已修改
-    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    # 如果记录中冇导演，厂商排在第4位
-    if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    # 如果记录中有导演，厂商排在第5位
-    elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
-    else:
-        result = ''
+        return ''
    return result
-def getYear(htmlcode):   #获取年份
-    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
-def getCover(htmlcode):  #获取封面链接
-    doc = pq(htmlcode)
-    image = doc('a.bigImage')
-    return urljoin("https://www.javbus.com", image.attr('href'))
-def getRelease(htmlcode): #获取出版日期
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
-def getRuntime(htmlcode): #获取分钟 已修改
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
-    return result
-def getActor(htmlcode):   #获取女优
+def getRuntime(javbus_json): #获取播放时长
+    result = javbus_json.get('runtime')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
+# airav女优数据库较多日文汉字姓名，javbus较多日语假名，因此airav优先
+def getActor(htmlcode, javbus_json):   #获取女优
    b=[]
-    soup=BeautifulSoup(htmlcode,'lxml')
-    a=soup.find_all(attrs={'class':'star-name'})
-    for i in a:
-        b.append(i.get_text())
-    return b
-def getNum(htmlcode):     #获取番号
    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
-    return result
-def getDirector(htmlcode): #获取导演 已修改
+    a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
+    for v in a:
+        v = v.strip()
+        if len(v):
+            b.append(v)
+    if len(b):
+        return b
+    result = javbus_json.get('actor')
+    if isinstance(result, list) and len(result):
+        return result
+    return []
+def getNum(htmlcode, javbus_json):     #获取番号
+    result = javbus_json.get('number')
+    if isinstance(result, str) and len(result):
+        return result
    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    else:
-        result = ''         # 记录中有可能没有导演数据
+    title = str(html.xpath('/html/head/title/text()')[0])
+    result = str(re.findall('^\[(.*?)]', title)[0])
    return result
-
-def getOutline(htmlcode):  #获取演员
+def getDirector(javbus_json): #获取导演 已修改
+    result = javbus_json.get('director')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
+def getOutline(htmlcode):  #获取概述
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
-        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','')
+        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
        return result
    except:
        return ''
-def getSerise(htmlcode):   #获取系列 已修改
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    # 如果记录中冇导演，系列排在第6位
-    if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
-    # 如果记录中有导演，系列排在第7位
-    elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
-    else:
-        result = ''
-    return result
+def getSerise(javbus_json):   #获取系列 已修改
+    result = javbus_json.get('series')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
 def getTag(htmlcode):  # 获取标签
    tag = []
    soup = BeautifulSoup(htmlcode, 'lxml')
@@ -169,52 +169,50 @@ def main(number):
    try:
        try:
            htmlcode = get_html('https://cn.airav.wiki/video/' + number)
-            javbus_htmlcode = get_html('https://www.javbus.com/ja/' + number)
-
+            javbus_json = json.loads(javbus.main(number))

        except:
            print(number)

        dic = {
            # 标题可使用airav
-            'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
-            # 制作商选择使用javbus
-            'studio': getStudio(javbus_htmlcode),
-            # 年份也是用javbus
-            'year': str(re.search('\d{4}', getYear(javbus_htmlcode)).group()),
+            'title': getTitle(htmlcode),
+            # 制作商先找javbus，如果没有再找本站
+            'studio': getStudio(htmlcode, javbus_json),
+            # 年份先试javbus，如果没有再找本站
+            'year': getYear(htmlcode, javbus_json),
            #  简介 使用 airav
            'outline': getOutline(htmlcode),
            # 使用javbus
-            'runtime': getRuntime(javbus_htmlcode),
+            'runtime': getRuntime(javbus_json),
            # 导演 使用javbus
-            'director': getDirector(javbus_htmlcode),
-            # 作者 使用airav
-            'actor': getActor(javbus_htmlcode),
-            # 发售日使用javbus
-            'release': getRelease(javbus_htmlcode),
+            'director': getDirector(javbus_json),
+            # 演员 先试airav
+            'actor': getActor(htmlcode, javbus_json),
+            # 发售日先试javbus
+            'release': getRelease(htmlcode, javbus_json),
            # 番号使用javbus
-            'number': getNum(javbus_htmlcode),
+            'number': getNum(htmlcode, javbus_json),
            # 封面链接 使用javbus
-            'cover': getCover(javbus_htmlcode),
+            'cover': getCover(htmlcode, javbus_json),
            # 剧照获取
            'extrafanart': getExtrafanart(htmlcode),
            'imagecut': 1,
            # 使用 airav
            'tag': getTag(htmlcode),
            # 使用javbus
-            'label': getSerise(javbus_htmlcode),
+            'label': getSerise(javbus_json),
            # 妈的，airav不提供作者图片
-            'actor_photo': getActorPhoto(javbus_htmlcode),
-
+#            'actor_photo': getActorPhoto(javbus_json),
            'website': 'https://www.airav.wiki/video/' + number,
            'source': 'airav.py',
            # 使用javbus
-            'series': getSerise(javbus_htmlcode),
+            'series': getSerise(javbus_json)
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), )  # .encode('UTF-8')
        return js
    except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
@@ -226,6 +224,6 @@ def main(number):


 if __name__ == '__main__':
-    #print(main('ADN-188'))
-    print(main('ADN-188'))
-    print(main('CJOD-278'))
+    print(main('ADV-R0624'))  # javbus页面返回404, airav有数据
+    print(main('ADN-188'))    # 一人
+    print(main('CJOD-278'))   # 多人 javbus演员名称采用日语假名，airav采用日文汉字
--- a/WebCrawler/avsox.py
+++ b/WebCrawler/avsox.py
@@ -100,6 +100,9 @@ def main(number):
    soup = BeautifulSoup(web, 'lxml')
    info = str(soup.find(attrs={'class': 'row movie'}))
    try:
+        new_number = getNum(info)
+        if new_number.upper() != number.upper():
+            raise ValueError('number not found')
        dic = {
            'actor': getActor(web),
            'title': getTitle(web).strip(getNum(web)),
@@ -108,7 +111,7 @@ def main(number):
            'runtime': getRuntime(info),
            'director': '',  #
            'release': getRelease(info),
-            'number': getNum(info),
+            'number': new_number,
            'cover': getCover(web),
            'cover_small': getCover_small(a),
            'imagecut': 3,
@@ -121,7 +124,7 @@ def main(number):
            'series': getSeries(info),
        }
    except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -129,3 +132,4 @@ def main(number):

 if __name__ == "__main__":
    print(main('012717_472'))
+    print(main('1')) # got fake result raise 'number not found'
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -1,51 +1,53 @@
 import sys
 sys.path.append('../')
 import json
-from bs4 import BeautifulSoup
 from lxml import html
 import re
 from ADC_function import *

 def main(number: str) -> json:
    try:
-        caribbytes, browser = get_html_by_browser(
+        carib_obj, browser = get_html_by_browser(
            'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
            return_type="browser")

-        if not caribbytes or not caribbytes.ok:
+        if not carib_obj or not carib_obj.ok:
            raise ValueError("page not found")

        lx = html.fromstring(str(browser.page))

        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
            raise ValueError("page info not found")
+
+        dic = {
+            'title': get_title(lx),
+            'studio': '加勒比',
+            'year': get_year(lx),
+            'outline': get_outline(lx),
+            'runtime': get_runtime(lx),
+            'director': '',
+            'actor': get_actor(lx),
+            'release': get_release(lx),
+            'number': number,
+            'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
+            'tag': get_tag(lx),
+            'extrafanart': get_extrafanart(lx),
+            'label': get_series(lx),
+            'imagecut': 1,
+#            'actor_photo': get_actor_photo(browser),
+            'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
+            'source': 'carib.py',
+            'series': get_series(lx),
+        }
+        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
+        return js
+
    except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
-    dic = {
-        'title': get_title(lx),
-        'studio': '加勒比',
-        'year': get_year(lx),
-        'outline': get_outline(lx),
-        'runtime': get_runtime(lx),
-        'director': '',
-        'actor': get_actor(lx),
-        'release': get_release(lx),
-        'number': number,
-        'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
-        'tag': get_tag(lx),
-        'extrafanart': get_extrafanart(lx),
-        'label': get_series(lx),
-        'imagecut': 1,
-#        'actor_photo': get_actor_photo(browser),
-        'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
-        'source': 'carib.py',
-        'series': get_series(lx),
-    }
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
-    return js
+

 def get_title(lx: html.HtmlElement) -> str:
    return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
@@ -114,11 +116,10 @@ def get_actor_photo(browser):
        if pos<0:
            continue
        css = html[pos:pos+100]
-        p0 = css.find('background: url(')
-        p1 = css.find('.jpg)')
-        if p0<0 or p1<0:
+        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
+        if not cssBGjpgs or not len(cssBGjpgs[0]):
            continue
-        p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
+        p = {k: urljoin(browser.url, cssBGjpgs[0])}
        o.update(p)
    return o

--- a/WebCrawler/dlsite.py
+++ b/WebCrawler/dlsite.py
@@ -153,7 +153,7 @@ def main(number):
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
        return js
    except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
--- a/WebCrawler/fc2.py
+++ b/WebCrawler/fc2.py
@@ -93,10 +93,11 @@ def main(number):
            actor = '素人'
        lx = etree.fromstring(htmlcode2, etree.HTMLParser())
        cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
+        cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
        dic = {
            'title': lx.xpath('/html/head/title/text()')[0],
            'studio': getStudio_fc2com(htmlcode2),
-            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),   
+            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
            'outline': '',  # getOutline_fc2com(htmlcode2),
            'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
            'director': getStudio_fc2com(htmlcode2),
@@ -116,7 +117,7 @@ def main(number):
            'series': '',
        }
    except Exception as e:
-        if ADC_function.config.Config().debug():
+        if ADC_function.config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -124,4 +125,5 @@ def main(number):

 if __name__ == '__main__':
    print(main('FC2-1787685'))
+    print(main('FC2-2086710'))

--- a/WebCrawler/fc2club.py
+++ b/WebCrawler/fc2club.py
@@ -84,7 +84,7 @@ def main(number):
        dic = {
            'title': getTitle_fc2com(htmlcode2),
            'studio': getStudio_fc2com(htmlcode2),
-            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),   
+            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
            'outline': '',  # getOutline_fc2com(htmlcode2),
            'runtime': '',
            'director': getStudio_fc2com(htmlcode2),
@@ -103,7 +103,7 @@ def main(number):
            'series': '',
        }
    except Exception as e:
-        if ADC_function.config.Config().debug():
+        if ADC_function.config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -6,8 +6,7 @@ from lxml import etree#need install
 from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
-from WebCrawler import fanza
-from WebCrawler import airav
+import inspect

 def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
@@ -82,12 +81,16 @@ def getCID(htmlcode):
    result = re.sub('/.*?.jpg','',string)
    return result
 def getOutline(number):  #获取剧情介绍
+    if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
+        return ''   # 从airav.py过来的调用不计算outline直接返回，避免重复抓取数据拖慢处理速度
    try:
-        response = json.loads(airav.main(number))
-        result = response['outline']
+        htmlcode = get_html('https://cn.airav.wiki/video/' + number)
+        from WebCrawler.airav import getOutline as airav_getOutline
+        result = airav_getOutline(htmlcode)
        return result
    except:
-        return ''
+        pass
+    return ''
 def getSerise(htmlcode):   #获取系列 已修改
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    # 如果记录中冇导演，系列排在第6位
@@ -117,13 +120,15 @@ def getExtrafanart(htmlcode):  # 获取剧照
        extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
        extrafanart_imgs = extrafanart_pather.findall(html)
        if extrafanart_imgs:
-            return extrafanart_imgs
+            return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
    return ''

 def main_uncensored(number):
    htmlcode = get_html('https://www.javbus.com/ja/' + number)
    if getTitle(htmlcode) == '':
        htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
+    if "<title>404 Page Not Found" in htmlcode:
+        raise Exception('404 page not found')
    dic = {
        'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
        'studio': getStudio(htmlcode),
@@ -155,6 +160,8 @@ def main(number):
                htmlcode = get_html('https://www.fanbus.us/' + number)
            except:
                htmlcode = get_html('https://www.javbus.com/' + number)
+            if "<title>404 Page Not Found" in htmlcode:
+                raise Exception('404 page not found')
            dic = {
                'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
                'studio': getStudio(htmlcode),
@@ -180,7 +187,7 @@ def main(number):
        except:
            return main_uncensored(number)
    except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
@@ -191,5 +198,7 @@ def main(number):
        return js

 if __name__ == "__main__" :
+    print(main('ADV-R0624'))    # 404
    print(main('ipx-292'))
    print(main('CEMD-011'))
+    print(main('CJOD-278'))
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -5,7 +5,7 @@ from lxml import etree
 import json
 from bs4 import BeautifulSoup
 from ADC_function import *
-from WebCrawler import airav
+import secrets
 # import sys
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -21,7 +21,7 @@ def getActor(a):
    genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
    r = []
    idx = 0
-    actor_gendor = config.Config().actor_gender()
+    actor_gendor = config.getInstance().actor_gender()
    if not actor_gendor in ['female','male','both','all']:
        actor_gendor = 'female'
    for act in actors:
@@ -67,9 +67,15 @@ def getStudio(a):
    patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>')
    pianshang = patherr.findall(a)
    if pianshang:
-        result = pianshang[0]
-    else:
-        result = ""
+        result = pianshang[0].strip()
+        if len(result):
+            return result
+    # 以卖家作为工作室
+    html = etree.fromstring(a, etree.HTMLParser())
+    try:
+        result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
+    except:
+        result = ''
    return result

 def getRuntime(a):
@@ -171,16 +177,13 @@ def getTrailer(htmlcode):  # 获取预告片
    return video_url

 def getExtrafanart(htmlcode):  # 获取剧照
-    html_pather = re.compile(r'<div class=\"tile\-images preview\-images\">[\s\S]*?</a>\s+?</div>\s+?</div>')
-    html = html_pather.search(htmlcode)
-    if html:
-        html = html.group()
-        extrafanart_pather = re.compile(r'<a class="tile-item" href=\"(.*?)\"')
-        extrafanart_imgs = extrafanart_pather.findall(html)
-        if extrafanart_imgs:
-            return extrafanart_imgs
-    return ''
-
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    result = []
+    try:
+        result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href")
+    except:
+        pass
+    return result
 def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
@@ -195,11 +198,13 @@ def getDirector(a):
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 def getOutline(number):  #获取剧情介绍
    try:
-        response = json.loads(airav.main(number))
-        result = response['outline']
+        htmlcode = get_html('https://cn.airav.wiki/video/' + number)
+        from WebCrawler.airav import getOutline as airav_getOutline
+        result = airav_getOutline(htmlcode)
        return result
    except:
-        return ''
+        pass
+    return ''
 def getSeries(a):
    #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
@@ -208,7 +213,7 @@ def getSeries(a):
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

 def main(number):
-    javdb_site = random.choice(["javdb9", "javdb30"])
+    javdb_site = secrets.choice(["javdb9", "javdb30"])
    try:
        # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group():
        #     pass
@@ -303,8 +308,16 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b
            'series': getSeries(detail_page),

        }
+        if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
+            dic['actor'].append('素人')
+            if not dic['series']:
+                dic['series'] = dic['studio']
+            if not dic['label']:
+                dic['label'] = dic['studio']
+
+
    except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -316,7 +329,9 @@ if __name__ == "__main__":
    # print(main('blacked.20.05.30'))
    # print(main('AGAV-042'))
    # print(main('BANK-022'))
-    print(main('FC2-735670'))
-    print(main('FC2-1174949')) # not found
+    print(main('093021_539'))  # 没有剧照 片商pacopacomama
+    # print(main('FC2-2278260'))
+    # print(main('FC2-735670'))
+    # print(main('FC2-1174949')) # not found
    print(main('MVSD-439'))
-    print(main('EHM0001')) # not found
+    # print(main('EHM0001')) # not found
--- a/WebCrawler/mgstage.py
+++ b/WebCrawler/mgstage.py
@@ -137,7 +137,7 @@ def main(number2):
            'series': getSeries(a),
        }
    except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}

--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -224,7 +224,7 @@ def main(number):
            'series': getSeries(detail_page),
        }
    except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}