忘记上次在哪看的91论坛多线程下载了,今天找了几个地方没找到,百度出来一个,测试可用。
pip install lxml
pip install requests
# -*- coding: utf-8 -*-
# @Time    : 2018/9
# @Author  : grhgrhrthh
# @Email   : fwefwefwef
# @File    : 91_pron_pic.py
# @Software: PyCharm
import urllib.request
from lxml import etree
import os
import requests
import random
import string
import re
"""
91pron图片站爬虫
"""
class Pron_91():
    """Crawler for a 91 forum picture board.

    Reads one forum listing page, follows every thread link on it, and saves
    each thread's attached images into a folder named after the thread title.
    """

    def __init__(self, url, save_dir='F:axz'):
        """
        :param url: listing-page URL, e.g. ".../forumdisplay.php?fid=19&page=3"
        :param save_dir: root folder for downloads; the default preserves the
                         original hard-coded Windows path.
        """
        self.url = url
        # Derive "http://host/" from the URL (scheme forced to http, as before).
        self.domain = "http://" + str(self.url).split('/')[2] + "/"
        self.save_dir = save_dir
        self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}

    def craw_urls(self):
        """Return the absolute URLs of all threads on the listing page."""
        request = urllib.request.Request(self.url, headers=self.ua_header)
        response = urllib.request.urlopen(request)
        xmlselector = etree.HTML(response.read().decode('utf-8'))
        # Thread titles are the bold purple anchors on this board's markup.
        hrefs = set(xmlselector.xpath('//span/a[@style="font-weight: bold;color: #8F2A90"]/@href'))
        return [self.domain + href for href in hrefs]

    def random_file_name(self):
        """Return a random 12-character alphanumeric file-name stem
        (sampled without replacement, so all characters are distinct)."""
        return ''.join(random.sample(string.ascii_letters + string.digits, 12))

    def _safe_title(self, title):
        """Sanitize a thread title for use as a Windows directory name.

        The original script handled '/' and ':' in mutually exclusive
        branches, so a title containing BOTH crashed os.mkdir; replace both.
        """
        return title.replace('/', '_').replace(':', '')

    def craw_pic(self):
        """Download every attached image of every thread on the listing page."""
        for link in self.craw_urls():
            request = urllib.request.Request(link, headers=self.ua_header)
            response = urllib.request.urlopen(request)
            xmlselector = etree.HTML(response.read().decode('utf-8'))
            title = xmlselector.xpath('//h1')
            name = self._safe_title(str(title[0].text))
            # Skip threads already downloaded in a previous run.
            if name in os.listdir(self.save_dir):
                print('{} --- 资源在文件夹中,将解析下一个'.format(name))
                continue
            # Windows cannot create a directory whose name ends with '.'.
            if name.endswith("."):
                print('{} --- 资源在文件夹中,将解析下一个'.format(name))
                continue
            pic = xmlselector.xpath('//div[@class="t_attach"]/a/@href')
            if len(pic) == 0:
                # No attachments: don't create an empty folder. (The original
                # had this guard in only one of its three duplicated branches.)
                continue
            folder = os.path.join(self.save_dir, name)
            os.mkdir(folder)
            print(name + " ---> 文件夹创建完成")
            # BUG FIX: the original printed the href list object here
            # instead of the number of images.
            print("共{}张图片,请等待......".format(len(pic)))
            for x, href in enumerate(pic, 1):
                print("正在下载第{}图片".format(x))
                r = requests.get(self.domain + str(href))
                # os.path.join writes the image INSIDE the thread folder; the
                # original concatenated without a separator and wrote siblings.
                with open(os.path.join(folder, "{}.jpg".format(self.random_file_name())), 'wb') as f:
                    f.write(r.content)
            print(name + " ---> 已经下载完毕")
            print("-" * 30)
if __name__ == '__main__':
    # Crawl listing pages starting at page 3 and advance forever (stop with
    # Ctrl-C). The original crawled page 3 once before the loop and then
    # again as the loop's first iteration; the duplicate crawl is removed.
    # NOTE(review): the URL has an empty host ("https:///...") — it looks
    # scrubbed by the forum; fill in the real domain before running.
    page = 3
    while True:
        pron = Pron_91("https:///forumdisplay.php?fid=19&page=" + str(page))
        pron.craw_pic()
        page += 1
        print("下一页" * 3)
        print(str(page))
复制代码
用户评论 0
暂无评论