add code shtjgb&shtjnq&shyq

This commit is contained in:
yiliyassh@163.com
2022-05-22 17:53:23 +08:00
parent 764b00913d
commit fc9772fd33
6 changed files with 200 additions and 9 deletions

70
code/shtjgb.py Normal file
View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.22
import re
from urllib import request
# 1. Fetch the page
def get_html(url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Imitate a browser's request header.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
    req = request.Request(url, headers=headers)
    # The context manager closes the response even if read()/decode() raises.
    with request.urlopen(req) as response:
        # Change the charset here if the target page is not UTF-8.
        return response.read().decode('utf-8')
# 2. Process the data (matching rules)
def get_url(content):
    """Extract the list entries from the page text ``content``.

    Returns a list of 3-tuples, one per matched ``<li>`` entry: the link's
    ``href`` value, its ``title`` value, and the text of the trailing
    ``<span>``. The list is empty when nothing matches.
    """
    # The expression is tied to this page's markup; other pages need their own.
    entry_regex = '<li> <p> <a target="_blank" href="(.*?)" title="(.*?)">.*?</a> <span>(.*?)</span> </p> </li>'
    matcher = re.compile(entry_regex, re.S)
    return matcher.findall(content)
# 3. Display the data
def show_result(items):
    """Print a Markdown row for every entry whose title contains '公报'.

    Each element of ``items`` is read by index: [0] link, [1] title,
    [2] third captured field. A matching entry is printed as
    ``|[title](link)|field``. Returns ``None``.
    """
    for entry in items:
        link, heading, stamp = entry[0], entry[1], entry[2]
        # Markdown link cell followed by the third field.
        row = '|[' + heading + '](' + link + ')|' + stamp
        if '公报' in str(heading):
            print(row)
        # Download step, currently disabled:
        # get_file(link)
# 4. Download the data (write it to a file)
def get_file(url, dest_dir='E:/download/'):
    """Download ``url`` into ``dest_dir`` and print a confirmation line.

    The local file is named after the last '/'-separated segment of ``url``.
    ``dest_dir`` is not created here, so it must already exist.

    Args:
        url: Address of the file to download.
        dest_dir: Directory that receives the file. The default is the
            path this function previously had hard-coded.

    Raises:
        urllib.error.URLError: If the request fails.
        OSError: If the destination file cannot be opened or written.
    """
    import os  # function-scope import keeps this block self-contained

    file_name = url.split('/')[-1]
    file_names = os.path.join(dest_dir, file_name)
    req = request.Request(url)
    block_sz = 8192
    # Both handles are closed even when a read or write fails midway.
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        # Copy in fixed-size chunks until read() returns an empty bytes object.
        for buffer in iter(lambda: u.read(block_sz), b''):
            f.write(buffer)
    print ("Sucessful to download" + " " + file_name)
# Download 2002~2021 (the commented-out URLs are presumably the other index
# pages of the same listing; enable one at a time)
#url = 'http://tjj.sh.gov.cn/tjgb/index.html'
#url = 'http://tjj.sh.gov.cn/tjgb/index_2.html'
url = 'http://tjj.sh.gov.cn/tjgb/index_3.html'
# Fetch the page text
html = get_html(url)
# Extract the (href, title, span) entries
items = get_url(html)
# Print the matching entries
show_result(items)

67
code/shtjnq.py Normal file
View File

@@ -0,0 +1,67 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# Date: 2022.5.22
import re
from urllib import request
# 1. Fetch the page
def get_html(url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Imitate a browser's request header.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
    req = request.Request(url, headers=headers)
    # The context manager closes the response even if read()/decode() raises.
    with request.urlopen(req) as response:
        # Change the charset here if the target page is not UTF-8.
        return response.read().decode('utf-8')
# 2. Process the data (matching rules)
def get_url(content):
    """Extract the list entries from the page text ``content``.

    Returns a list of 3-tuples, one per matched ``<li>`` entry: the link's
    ``href`` value, its ``title`` value, and the link text. The list is
    empty when nothing matches.
    """
    # The expression is tied to this page's markup; other pages need their own.
    entry_regex = '<li> <p> <a target="_blank" href="(.*?)" title="(.*?)">(.*?)</a> </p> </li>'
    matcher = re.compile(entry_regex, re.S)
    return matcher.findall(content)
# 3. Display the data
def show_result(items):
    """Print one ``name|link`` line for every element of ``items``.

    Each element is read by index: [0] link, [1] title (read but not
    printed), [2] link text. Returns ``None``.
    """
    for entry in items:
        link, _title, label = entry[0], entry[1], entry[2]
        # Link text first, then the address, separated by a pipe.
        print(label + '|' + link)
        # Download step, currently disabled:
        # get_file(link)
# 4. Download the data (write it to a file)
def get_file(url, dest_dir='E:/download/'):
    """Download ``url`` into ``dest_dir`` and print a confirmation line.

    The local file is named after the last '/'-separated segment of ``url``.
    ``dest_dir`` is not created here, so it must already exist.

    Args:
        url: Address of the file to download.
        dest_dir: Directory that receives the file. The default is the
            path this function previously had hard-coded.

    Raises:
        urllib.error.URLError: If the request fails.
        OSError: If the destination file cannot be opened or written.
    """
    import os  # function-scope import keeps this block self-contained

    file_name = url.split('/')[-1]
    file_names = os.path.join(dest_dir, file_name)
    req = request.Request(url)
    block_sz = 8192
    # Both handles are closed even when a read or write fails midway.
    with request.urlopen(req) as u, open(file_names, 'wb') as f:
        # Copy in fixed-size chunks until read() returns an empty bytes object.
        for buffer in iter(lambda: u.read(block_sz), b''):
            f.write(buffer)
    print ("Sucessful to download" + " " + file_name)
# Download 2002~2021
url = 'http://tjj.sh.gov.cn/tjnj/index.html'
# Fetch the page text
html = get_html(url)
# Extract the (href, title, link text) entries
items = get_url(html)
# Print every entry
show_result(items)

50
code/shyq.py Normal file
View File

@@ -0,0 +1,50 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: yiliyas
# date: 2022.4.24
from urllib import request
import re
# Category statistics from the JSON "daily supplies directory" of the Shanghai
# Commercial Federation's anti-epidemic supply matchmaking platform, followed
# by the enterprise list. Both are printed as Markdown table rows.

# Number of enterprises requested per page (the `limit` query parameter).
PAGE_SIZE = 20


def _fetch_text(address):
    """Return the body of ``address`` decoded as UTF-8, dropping any BOM."""
    # The context manager closes the connection even if read()/decode() raises.
    with request.urlopen(request.Request(address)) as response:
        return response.read().decode('utf-8-sig')


url = 'https://c.kdcer.com/sh_keep_supply/home/categories'
content = _fetch_text(url)
# Capture each category's name and its count.
pattern = re.compile('"name":"(.*?)",.*?,"count":(.*?)}',re.S)
items = re.findall(pattern,content)
# Total number of enterprises. It stays 0 when no "全部企业" (all enterprises)
# category is present, so the paging loop below is skipped instead of failing
# with a NameError.
nums = 0
for item in items:
    cname = item[0]
    cnum = item[1]
    # The "全部企业" row carries the overall total.
    if str(cname).find("全部企业")>-1:
        nums = int(cnum)
    # Emit the category as a Markdown row.
    strs = '|'+cname+'|'+str(cnum)
    print(strs)
# Markdown header for the enterprise table.
title2 = '\n|序号|类型|点击(详细工商信息)|\n'+'|:----|:----|:----|'
print(title2)
n = 0
# Page count is ceil(nums / PAGE_SIZE). The loop must run over pages, not over
# the enterprise count, and an exact multiple needs no extra trailing page.
m = (nums + PAGE_SIZE - 1) // PAGE_SIZE
# Capture each enterprise's name and category name; compiled once for all pages.
pattern = re.compile('"name":"(.*?)",".*?,"categoryName":"(.*?)"',re.S)
for i in range(1, m + 1):
    url = 'https://c.kdcer.com/sh_keep_supply/home/enterprises?categoryId=-1&keyword=&page='+str(i)+'&limit='+str(PAGE_SIZE)
    content = _fetch_text(url)
    items = re.findall(pattern,content)
    for item in items:
        # Running row number across all pages.
        n = n+1
        cname = item[0]
        cgname = item[1]
        strs = '|'+str(n)+'|'+cgname+'|'+cname
        print(strs)