Fix carib and 1pondo number issues

2021-05-05 12:00:10 +08:00
parent 07026f89f8
commit 10e35cbd92
5 changed files with 147 additions and 33 deletions
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -331,7 +331,7 @@ def translateTag_to_sc(tag):
                        '女装人妖', '及膝襪': '及膝袜', '泡泡襪': '泡泡袜', '空中小姐': '空中小姐', '旗袍': '旗袍', '兔女郎': '兔女郎',
                    '女祭司': '女祭司', '動畫人物': '动画人物', '迷你裙警察': '迷你裙警察', '成熟的女人': '成熟的女人', '巨乳': '巨乳',
                    '蘿莉塔': '萝莉塔', '無毛': '无毛', '屁股': '屁股', '苗條': '苗条', '素人': '素人', '乳房': '乳房',
-                    '巨大陰莖': '巨大阴茎', '胖女人': '胖女人', '平胸': '平胸', '高': '高', '美腳': '美脚', '孕婦': '孕妇',
+                    '巨大陰莖': '巨大阴茎', '胖女人': '胖女人', '平胸': '平胸', '高': '高', '美腳': '美腿', '孕婦': '孕妇',
                    '巨大屁股': '巨大屁股', '瘦小身型': '瘦小身型', '變性者': '变性者', '肌肉': '肌肉', '超乳': '超乳', '乳交':
                        '乳交', '中出': '中出', '多P': '多P', '69': '69', '淫語': '淫语', '女上位': '女上位', '自慰': '自慰',
                    '顏射': '颜射', '潮吹': '潮吹', '口交': '口交', '舔陰': '舔阴', '肛交': '肛交', '手指插入': '手指插入',
@@ -359,7 +359,7 @@ def translateTag_to_sc(tag):
                    '動画': '视频', '電子書籍': '电子书', '同人': '同人志', 'アダルトPCゲーム': '成人PC游戏', 'DVD/CD':
                        ' DVD/CD', 'コミック': '漫画', 'いろいろレンタル': '各种租赁', '通販': '购物', 'マーケットプレイス': '市场',
                    '3Dプリント': ' 3D打印', 'ロボット': '机器人', '巨乳': '巨乳', '熟女': '熟女', 'ギャル': '美少女',
-                    '人妻・主婦': '人妻', '女子校生': '高中女生', '中出し': '中出', 'アナル': '肛门', 'ニューハーフ': '变性人',
+                    '人妻・主婦': '人妻', '女子校生': '高中女生', '中出し': '中出', 'アナル': '肛交', 'ニューハーフ': '变性人',
                    'VR専用': 'VR专用', 'ハイクオリティVR': '高质量VR', 'アイドル・芸能人': '偶像/名人', 'アクメ・オーガズム':
                        '性高潮', 'アスリート': '运动员', '姉・妹': '姐妹', 'イタズラ': '恶作剧', 'インストラクター': '指导员',
                    'ウェイトレス': '服务员', '受付嬢': '接待员', 'エステ': '美容院', 'M男': 'M男', 'M女': 'M女', 'OL':
@@ -381,7 +381,7 @@ def translateTag_to_sc(tag):
                    '面接': '面试', 'モデル': '模特', '野外・露出': '野外・露出', 'ヨガ': '瑜伽', '乱交': '狂欢', '旅行': '旅行',
                    'レースクィーン': '种族女王', '若妻・幼妻': '年轻妻子/年轻妻子', 'アジア女優': '亚洲女演员', '巨尻': '大屁股', '筋肉':
                        '肌肉', '小柄': '娇小', '黒人男優': '黑人演员', '処女': '处女', '女装・男の娘': '伪娘', 'スレンダー':
-                        '苗条', '早漏': '早泄', 'そっくりさん': '相似', '長身': '高大', '超乳': '巨乳', 'デカチン・巨根':
+                        '苗条', '早漏': '早泄', 'そっくりさん': '相似', '長身': '高个', '超乳': '巨乳', 'デカチン・巨根':
                        '大鸡巴', '童貞': '处女', '軟体': '柔软的身体', '妊婦': '孕妇', '白人女優': '白人女演员', 'パイパン': '剃光',
                    '日焼け': '晒伤', '貧乳・微乳': '贫乳/小乳房', '美少女': '美少女', '美乳': ' 美乳',
                    'ふたなり': ' 双胞胎', 'ぽっちゃり': ' 丰满', 'ミニ系': ' 迷你系', '学生服':
@@ -433,7 +433,7 @@ def translateTag_to_sc(tag):
                    '競泳・スクール水着': '游泳学校的游泳衣', '素人': '素人', 'ベスト・総集編': '精选集', '美乳': '美乳', '美少女': '美少女',
                    '職業色々': '各种职业', '配信専用': '配信专用', '電マ': '电码', '顔射': '颜射', 'アイドル・芸能人': '偶像艺人',
                    'アクション・格闘': '格斗动作', '足コキ': '足交', '脚フェチ': '脚控', 'アジア女優': '亚洲女演员', '汗だく': '满头大汗',
-                    'アナルセックス': '肛门性爱', 'アナル': '肛门', '姉・妹': '姐姐、妹妹', 'Eカップ': 'E罩杯', 'イタズラ': '恶作剧',
+                    'アナルセックス': '肛门性爱', 'アナル': '肛交', '姉・妹': '姐姐、妹妹', 'Eカップ': 'E罩杯', 'イタズラ': '恶作剧',
                    '異物挿入': '插入异物', 'イメージビデオ': '视频图像', '色白': '白皙', '淫語': '淫语', '淫語モノ': '淫语故事',
                    'インストラクター': '教练', '飲尿': '饮尿', '淫乱・ハード系': '淫乱硬系', 'ウェイトレス': '女服务生', 'Hカップ':
                        'H罩杯', 'SF': 'SF', 'SM': 'SM', 'Fカップ': 'F罩杯', 'M男': 'M男', 'お母さん': '妈妈',
@@ -449,7 +449,7 @@ def translateTag_to_sc(tag):
                        '女医生', '女教師': '女教师', '女子アナ': '女主播', '女子校生': '女学生', '女子大生': '女大学生', '女性向け':
                        '面向女性', '女装・男の娘': '伪娘', 'Gカップ': 'G罩杯', 'スカトロ': '蹲', 'スチュワーデス・CA': '空姐CA',
                    'スポーツ': '体育运动', '清楚': '清秀', '制服': '制服', 'その他フェチ': '其他恋物癖', '体操着・ブルマ': '运动服',
-                    '多人数': '很多人', '着エロ': '色情', '長身': '高个子', '痴漢': '痴汉', '手コキ': '手锯', '手マン': '手艺人',
+                    '多人数': '很多人', '着エロ': '色情', '長身': '高个子', '痴漢': '痴汉', '手コキ': '手淫', '手マン': '手艺人',
                    'Dカップ': 'D罩杯', '泥酔': '烂醉如泥', 'デカチン・巨根': '巨根', '盗撮': '偷拍', '盗撮・のぞき': '偷拍', '童貞':
                        '处男', 'ドキュメンタリー': '记录片', 'ドラッグ・媚薬': '药局', 'ドラマ': '电视剧', 'ニューハーフ': '变性人',
                    'ニーソックス': '过膝袜', '妊婦': '孕妇', '寝取り・寝取られ': '睡下', 'HowTo': 'HowTo',
@@ -460,7 +460,12 @@ def translateTag_to_sc(tag):
                    '３P・乱交': '3P・乱交', '野外・露出': '野外露出', '海外': '国外', 'レズ': '女士', 'アニメ': '动画',
                    'アダルト': '成人', 'アイドル': '空闲', '個人撮影': '个人摄影', '無修正': '无修正', 'コスプレ': '角色扮演',
                    '下着': '内衣', '水着': '游泳衣', 'パンチラ': '小册子', 'フェラ': '口交', 'モデル': '模型', '中出し': '中出', '可愛い': '可爱',
-                    'オリジナル': '原始', '貧乳': '贫乳', 'オナニー': '自慰', 'パイパン': '菠萝', 'ロリ': '萝莉', '生ハメ': '第一人称'
+                    'オリジナル': '原始', '貧乳': '贫乳', 'オナニー': '自慰', 'パイパン': '菠萝', 'ロリ': '萝莉', '生ハメ': '第一人称',
+
+
+                    #caribbeancom
+                    '青姦': '野战', '初裏': '破处', 'ブルマー': '体操服', 'クンニ ベスト/オムニバス':'汇编',  'クンニ': '舔阴',
+
                    }
        try:
            return dict_gen[tag]
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -0,0 +1,93 @@
+import sys
+sys.path.append('../')
+import json
+from bs4 import BeautifulSoup
+from lxml import html
+import re
+import urllib.request
+import socket
+from ADC_function import *
+
+def get_html(url):
+    """Download ``url`` and return the response body decoded as EUC-JP text.
+
+    :param url: absolute URL of the page to fetch.
+    :return: the page source as ``str``.
+    :raises: whatever ``urllib.request.urlopen`` raises on network errors or
+        timeout, and ``UnicodeDecodeError`` if the body is not valid EUC-JP.
+    """
+    # The 10 second timeout is passed per request rather than through
+    # socket.setdefaulttimeout(), which would change the timeout of every
+    # socket created anywhere in the process.
+    with urllib.request.urlopen(url, timeout=10) as response:
+        # The context manager closes the connection even if read() fails.
+        raw_bytes = response.read()
+    return raw_bytes.decode("euc_jp")
+
+def main(number: str) -> str:
+    """Scrape the caribbeancom.com movie page for ``number``.
+
+    :param number: movie id used in the page URL, e.g. ``"041721-001"``.
+    :return: a JSON string. When the page cannot be fetched or parsed, or the
+        movie-info section is missing, the JSON holds only an empty
+        ``"title"``; otherwise it holds the full metadata dictionary.
+    """
+    page_url = 'https://www.caribbeancom.com/moviepages/' + number + '/index.html'
+    try:
+        caribhtml = get_html(page_url)
+
+        soup = BeautifulSoup(caribhtml, "html.parser")
+        lx = html.fromstring(str(soup))
+
+        if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
+            raise ValueError("page info not found")
+    except Exception:
+        # Catch Exception rather than using a bare ``except`` so that
+        # KeyboardInterrupt / SystemExit still propagate to the caller.
+        dic = {"title": ""}
+        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+    # NOTE(review): the get_* helpers below index the first XPath match, so
+    # they raise IndexError when the page layout differs; that error is not
+    # caught here.
+    dic = {
+        'title': get_title(lx),
+        'studio': '加勒比',
+        'year': get_year(lx),
+        'outline': '',
+        'runtime': get_runtime(lx),
+        'director': '',
+        'actor': get_actor(lx),
+        'release': get_release(lx),
+        'number': number,
+        'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
+        'tag': get_tag(lx),
+        'extrafanart': get_extrafanart(lx),
+        'label': '',
+        'imagecut': 0,
+        'actor_photo': '',
+        'website': page_url,
+        'source': 'carib.py',
+        'series': '',
+    }
+    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+def get_title(lx: html.HtmlElement) -> str:
+    """Return the movie title: the first ``h1[itemprop='name']`` text, stripped.
+
+    Raises ``IndexError`` when the XPath query matches nothing.
+    """
+    heading_texts = lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")
+    title = str(heading_texts[0])
+    return title.strip()
+
+def get_year(lx: html.HtmlElement) -> str:
+    """Return the first four characters of the second spec item's text.
+
+    Raises ``IndexError`` when the XPath query matches nothing.
+    """
+    spec_texts = lx.xpath("//li[2]/span[@class='spec-content']/text()")
+    release_text = spec_texts[0]
+    return release_text[:4]
+
+def get_release(lx: html.HtmlElement) -> str:
+    """Return the second spec item's text with every ``/`` replaced by ``-``.
+
+    Raises ``IndexError`` when the XPath query matches nothing.
+    """
+    spec_texts = lx.xpath("//li[2]/span[@class='spec-content']/text()")
+    release_text = spec_texts[0]
+    return release_text.replace('/', '-')
+
+def get_actor(lx: html.HtmlElement) -> list:
+    """Return the actor names listed on the page.
+
+    Entries whose text is exactly ``'他'`` are dropped. The result is a list
+    (empty when no actor link matches), which is why the return annotation is
+    ``list`` and not ``str``.
+    """
+    actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
+    return [act for act in actors if str(act) != '他']
+
+def get_tag(lx: html.HtmlElement) -> list:
+    """Return the page's genre tags, each passed through ``translateTag_to_sc``.
+
+    The result is a list (empty when no genre link matches), which is why the
+    return annotation is ``list`` and not ``str``.
+    """
+    genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
+    return [translateTag_to_sc(str(genre)) for genre in genres]
+
+def get_extrafanart(lx: html.HtmlElement) -> list:
+    """Return absolute URLs of the sample images linked from the gallery grid.
+
+    Collection stops at the first href containing ``/member/``; that entry and
+    everything after it are left out. The result is a list (possibly empty),
+    which is why the return annotation is ``list`` and not ``str``.
+    """
+    image_urls = []
+    hrefs = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
+    for href in hrefs:
+        jpg = str(href)
+        if '/member/' in jpg:
+            # Stop here rather than skip: later entries are not collected either.
+            break
+        image_urls.append('https://www.caribbeancom.com' + jpg)
+    return image_urls
+
+def get_runtime(lx: html.HtmlElement) -> str:
+    """Return the stripped text of the first ``span[itemprop='duration']``.
+
+    Raises ``IndexError`` when the XPath query matches nothing.
+    """
+    duration_texts = lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")
+    runtime = str(duration_texts[0])
+    return runtime.strip()
+
+if __name__ == "__main__":
+    # Manual smoke test: scrape two sample numbers and print the JSON results.
+    for sample_number in ("041721-001", "080520-001"):
+        print(main(sample_number))
--- a/config.ini
+++ b/config.ini
@@ -26,7 +26,7 @@ max_title_len= 50
 update_check=1

 [priority]
-website=javbus,javdb,airav,fanza,xcity,mgstage,fc2,avsox,jav321,dlsite
+website=javbus,javdb,airav,fanza,xcity,mgstage,fc2,avsox,jav321,dlsite,carib

 [escape]
 literals=\()/
--- a/core.py
+++ b/core.py
@@ -23,6 +23,7 @@ from WebCrawler import mgstage
 from WebCrawler import xcity
 # from WebCrawler import javlib
 from WebCrawler import dlsite
+from WebCrawler import carib


 def escape_path(path, escape_literals: str):  # Remove escape literals
@@ -63,6 +64,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config):  # 从JSON
        "xcity": xcity.main,
        # "javlib": javlib.main,
        "dlsite": dlsite.main,
+        "carib": carib.main,
    }

    # default fetch order list, from the beginning to the end
@@ -70,19 +72,25 @@ def get_data_from_json(file_number, filepath, conf: config.Config):  # 从JSON

    # if the input file name matches certain rules,
    # move some web service to the beginning of the list
-    if "avsox" in sources and (re.match(r"^\d{5,}", file_number) or
-        "HEYZO" in file_number or "heyzo" in file_number or "Heyzo" in file_number
+    lo_file_number = file_number.lower()
+    if "carib" in sources and (re.match(r"^\d{6}-\d{3}", file_number)
    ):
+        sources.insert(0, sources.pop(sources.index("carib")))
+    elif "avsox" in sources and (re.match(r"^\d{5,}", file_number) or
+        "heyzo" in lo_file_number
+    ):
+        # if conf.debug() == True:
+        #     print('[+]select avsox')
        sources.insert(0, sources.pop(sources.index("avsox")))
    elif "mgstage" in sources and (re.match(r"\d+\D+", file_number) or
-        "siro" in file_number or "SIRO" in file_number or "Siro" in file_number
+        "siro" in lo_file_number
    ):
        sources.insert(0, sources.pop(sources.index("mgstage")))
-    elif "fc2" in sources and ("fc2" in file_number or "FC2" in file_number
+    elif "fc2" in sources and ("fc2" in lo_file_number
    ):
        sources.insert(0, sources.pop(sources.index("fc2")))
    elif "dlsite" in sources and (
-        "RJ" in file_number or "rj" in file_number or "VJ" in file_number or "vj" in file_number
+        "rj" in lo_file_number
    ):
        sources.insert(0, sources.pop(sources.index("dlsite")))

@@ -364,7 +372,7 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa
                r = requests.get(url, headers=headers, timeout=timeout, proxies=proxies)
                if r == '':
                    print('[-]Movie Data not found!')
-                    return 
+                    return
                with open(str(path) + "/" + filename, "wb") as code:
                    code.write(r.content)
                return
@@ -376,7 +384,7 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa
                r = requests.get(url, timeout=timeout, headers=headers)
                if r == '':
                    print('[-]Movie Data not found!')
-                    return 
+                    return
                with open(str(path) + "/" + filename, "wb") as code:
                    code.write(r.content)
                return
@@ -636,14 +644,14 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config
                os.rename(filepath.replace(houzhui, subname), path + '/' + number + leak_word + c_word + subname)
                print('[+]Sub moved!')
                return True
-        
+
    except FileExistsError:
        print('[-]File Exists! Please check your movie!')
        print('[-]move to the root folder of the program.')
-        return 
+        return
    except PermissionError:
        print('[-]Error! Please run as administrator!')
-        return 
+        return


 def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf):  # 文件路径，番号，后缀，要移动至的位置
@@ -667,7 +675,7 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
    except FileExistsError:
        print('[-]File Exists! Please check your movie!')
        print('[-]move to the root folder of the program.')
-        return 
+        return
    except PermissionError:
        print('[-]Error! Please run as administrator!')
        return
@@ -736,7 +744,7 @@ def core_main(file_path, number_th, conf: config.Config):
    if '-c.' in filepath or '-C.' in filepath or '中文' in filepath or '字幕' in filepath:
        cn_sub = '1'
        c_word = '-C'  # 中文字幕影片后缀
-    
+
    # 判断是否无码
    if is_uncensored(number):
        uncensored = 1
@@ -761,7 +769,7 @@ def core_main(file_path, number_th, conf: config.Config):
    # main_mode
    #  1: 刮削模式 / Scraping mode
    #  2: 整理模式 / Organizing mode
-    #  3：不改变路径刮削 
+    #  3：不改变路径刮削
    if conf.main_mode() == 1:
        # 创建文件夹
        path = create_folder(conf.success_folder(),  json_data.get('location_rule'), json_data, conf)
@@ -780,7 +788,7 @@ def core_main(file_path, number_th, conf: config.Config):
                trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf, conf.failed_folder())
        except:
            pass
-        
+
        try:
            # 下载剧照 data, path, conf: config.Config, filepath, failed_folder
            if json_data.get('extrafanart'):
@@ -800,7 +808,7 @@ def core_main(file_path, number_th, conf: config.Config):
        thumb_path = path + '/' + number + leak_word + c_word + '-thumb.jpg'
        if conf.is_watermark():
            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
-        
+
    elif conf.main_mode() == 2:
        # 创建文件夹
        path = create_folder(conf.success_folder(), json_data.get('location_rule'), json_data, conf)
@@ -810,7 +818,7 @@ def core_main(file_path, number_th, conf: config.Config):
        thumb_path = path + '/' + number + leak_word + c_word + '-thumb.jpg'
        if conf.is_watermark():
            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
-        
+
    elif conf.main_mode() == 3:
        path = file_path.rsplit('/', 1)[0]
        path = path.rsplit('\\', 1)[0]
--- a/number_parser.py
+++ b/number_parser.py
@@ -36,12 +36,16 @@ def get_number(debug,filepath: str) -> str:
                #filepath = filepath.replace("_", "-")
                filepath.strip('22-sht.me').strip('-HD').strip('-hd')
                filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath))  # 去除文件名中时间
-                if 'FC2' or 'fc2' in filename:
-                    filename = filename.replace('PPV', '').replace('ppv', '').replace('--', '-').replace('_', '-')
+                lower_check = filename.lower()
+                if 'fc2' in lower_check:
+                    filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
                file_number = re.search(r'\w+-\w+', filename, re.A).group()
-                tokyo_hot_check = filename.lower()
-                if "tokyo" in tokyo_hot_check and "hot" in tokyo_hot_check:
-                    file_number = re.search(r'(cz|k|n|red-|se)\d{3,4}', tokyo_hot_check, re.A).group()
+                if "tokyo" in lower_check and "hot" in lower_check:
+                    file_number = re.search(r'(cz|k|n|red-|se)\d{3,4}', lower_check, re.A).group()
+                if "carib" in lower_check:
+                    file_number = str(re.search(r'\d{6}(-|_)\d{3}', lower_check, re.A).group()).replace('_', '-')
+                if "1pon" in lower_check:
+                    file_number = str(re.search(r'\d{6}(-|_)\d{3}', lower_check, re.A).group()).replace('-', '_')
                return file_number
            else:  # 提取不含减号-的番号，FANZA CID
                # 欧美番号匹配规则
@@ -64,12 +68,16 @@ def get_number(debug,filepath: str) -> str:
            #filepath = filepath.replace("_", "-")
            filepath.strip('22-sht.me').strip('-HD').strip('-hd')
            filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath))  # 去除文件名中时间
-            if 'FC2' or 'fc2' in filename:
-                filename = filename.replace('PPV', '').replace('ppv', '').replace('--', '-').replace('_', '-')
+            lower_check = filename.lower()
+            if 'fc2' in lower_check:
+                filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
            file_number = re.search(r'\w+-\w+', filename, re.A).group()
-            tokyo_hot_check = filename.lower()
-            if "tokyo" in tokyo_hot_check and "hot" in tokyo_hot_check:
-                    file_number = re.search(r'(cz|k|n|red-|se)\d{3,4}', tokyo_hot_check, re.A).group()
+            if "tokyo" in lower_check and "hot" in lower_check:
+                file_number = re.search(r'(cz|k|n|red-|se)\d{3,4}', lower_check, re.A).group()
+            if "carib" in lower_check:
+                file_number = str(re.search(r'\d{6}(-|_)\d{3}', lower_check, re.A).group()).replace('_', '-')
+            if "1pon" in lower_check:
+                file_number = str(re.search(r'\d{6}(-|_)\d{3}', lower_check, re.A).group()).replace('-', '_')
            return file_number
        else:  # 提取不含减号-的番号，FANZA CID
            # 欧美番号匹配规则