I have been working at 博库 for a week now. The first two days were mostly spent getting familiar with the WeChat Official Account platform that 博库 developed in-house; the last three days went into rewriting the front-end processor code into proxy mode, so that library data is fetched through the proxy instead of directly.
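The "proxy mode" here just means the front-end processor no longer talks to the library system directly: it forwards the request to a proxy endpoint and relays the response back. Below is a minimal sketch of that idea, assuming a PHP front end with the curl extension; the endpoint URL, action name, and parameters are hypothetical placeholders, not the actual 博库 interface.

<?php
// Hypothetical proxy-mode sketch: forward a request to a proxy endpoint
// and hand its response back to the caller.
function fetch_via_proxy($action, array $params)
{
    $proxyUrl = 'http://proxy.example.com/library/' . $action; // hypothetical endpoint
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $proxyUrl . '?' . http_build_query($params));
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // return the body as a string
    curl_setopt($curl, CURLOPT_TIMEOUT, 10);       // don't hang forever on the proxy
    $data = curl_exec($curl);
    curl_close($curl);
    return $data;
}

// Example call (hypothetical action and parameter names):
// $json = fetch_via_proxy('search', array('keyword' => 'php'));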
During that time I used the LIB_parse.php function library to pull data out of pages, with curl used to simulate the requests.
Yesterday, while resting at home, I used LIB_parse.php to write some scraping rules. The photos on 华声论坛 looked quite nice, so I decided to practice on it.
<?php
set_time_limit(0);
include_once 'LIB_parse.php';
function get_curl_data($url)
{
    // initialize the curl session
    $curl = curl_init();
    // set the URL to fetch
    curl_setopt($curl, CURLOPT_URL, $url);
    // do not include the response headers in the output
    curl_setopt($curl, CURLOPT_HEADER, 0);
    // return the response as a string instead of printing it directly
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    // execute the request
    $data = curl_exec($curl);
    // close the curl session
    curl_close($curl);
    return $data;
}
$data = get_curl_data('http://bbs.voc.com.cn/forumdisplay.php?fid=50&orderby=views&page=1');
// grab every thread cell in the forum listing
$tds = parse_array($data, '<td class="tbfltd2"', '</td>');
foreach ($tds as $k => $v) {
    $as[] = parse_array($v, '<a', '</a>');
}
$as = array_filter($as);
// collect each thread's title and all of its page links
foreach ($as as $k => $v) {
    $res[$k]['title'] = strip_tags(return_between($v[0], '<a', '</a>', INCL));
    foreach ($v as $k1 => $v1) {
        $res[$k]['href'][] = get_attribute($v1, 'href');
    }
}
// foreach ($res[182]['href'] as $k => $v) {
//     $detail = get_curl_data('http://bbs.voc.com.cn/' . $v);
//     $detail = parse_array($detail, '<div id="', '</div>');
//     foreach ($detail as $k1 => $v1) {
//         $res2[] = get_attribute($v1, 'src');
//     }
// }
die;
// fetch every page of every thread -- this is the part that never finishes
foreach ($res as $k => $v) {
    foreach ($v['href'] as $k1 => $v1) {
        $detail = get_curl_data('http://bbs.voc.com.cn/' . $v1);
        $res2[] = parse_array($detail, '<div id="', '</div>');
    }
}
var_dump($res2);
die;
// pull the image src attributes out of the parsed divs
foreach ($detail as $k => $v) {
    $res3[] = get_attribute($v, 'src');
}
var_dump($res3);
This code cannot run to completion: the nested loops fire off far too many requests, so the page just sits there waiting, spinner going forever.
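One way to at least see partial output in the browser is to cap how much work a single run does. The sketch below reuses the same $res array and LIB_parse.php helpers from the script above; the cap of 5 threads is an arbitrary number chosen for illustration, not anything from the original code.

// Only fetch the first few threads per run so the request can actually finish.
$maxThreads = 5; // arbitrary cap for illustration
foreach (array_slice($res, 0, $maxThreads, true) as $k => $v) {
    foreach ($v['href'] as $k1 => $v1) {
        $detail = get_curl_data('http://bbs.voc.com.cn/' . $v1);
        foreach (parse_array($detail, '<div id="', '</div>') as $div) {
            $imgs[] = get_attribute($div, 'src');
        }
    }
}
var_dump($imgs);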
After getting up today I decided to try rewriting it in Python. It took a morning of looking things up and writing the extraction rules, but I ended up with a simple working scraper.
#!/usr/bin/python
# coding:utf-8
import urllib.request, socket, re, sys, os, math, gzip
from bs4 import BeautifulSoup

# First experiment: fetch a single topic page and list its images.
# url = 'http://bbs.voc.com.cn/topic-7652393-9-1.html'
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
#                          'Chrome/51.0.2704.63 Safari/537.36'}
# req = urllib.request.Request(url=url, headers=headers)
# res = urllib.request.urlopen(req)
# data = res.read()
# soup = BeautifulSoup(data, 'html5lib')   # attach a parser
# for link in soup.find_all('img'):        # find every <img> tag in the document
#     print(link.get('src'))

# Thread title and topic id of the thread currently being scraped.
NAME = ''
PAGE = ''

# Build the save path for one image, creating the per-thread folder if needed.
def saveImg(path, NAME):
    targetPath = "G:\\python\\images\\" + NAME
    if not os.path.isdir(targetPath):
        os.mkdir(targetPath)
    # keep the original file name (everything after the last '/')
    pos = path.rindex('/')
    return os.path.join(targetPath, path[pos + 1:])

# Save scraped text to a file (written as gbk).
def saveFile(data, i):
    path = "E:\\projects\\Spider\\06_csdn2\\papers\\paper_" + str(i + 1) + ".txt"
    file = open(path, 'wb')
    page = 'Current page: ' + str(i + 1) + '\n'
    file.write(page.encode('gbk'))
    for d in data:
        d = str(d) + '\n'
        file.write(d.encode('gbk'))
    file.close()

# Decompress gzip-compressed responses; pass plain responses through unchanged.
def ungzip(data):
    try:
        data = gzip.decompress(data)
    except:
        print("Not compressed, nothing to decompress...")
    return data

# The forum board listing: works out how many listing pages the board has.
class ForumList:
    def __init__(self, pageIdx=1, url=""):
        self.pageIdx = pageIdx
        self.url = 'http://bbs.voc.com.cn/forum-50-1.html'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        }

    # Total number of listing pages (the board shows 80 threads per page).
    def getListPages(self):
        req = urllib.request.Request(url=self.url, headers=self.headers)
        page = urllib.request.urlopen(req)
        data = page.read()
        # data = ungzip(data)
        # data = data.decode('utf-8')
        soup = BeautifulSoup(data, 'html5lib')
        # the <a class="p_total"> element holds the total thread count
        tag = soup.find('a', "p_total")
        if tag:
            pagesData = tag.get_text()
            print(pagesData)
            pagesNum = math.ceil(int(pagesData) / 80)
        else:
            pagesNum = 1
        return pagesNum

# A single thread: walks its pages and downloads every image it links to.
class Hussheng:
    def __init__(self, pageIdx=1, url=""):
        self.pageIdx = pageIdx
        self.url = 'http://bbs.voc.com.cn/topic-' + PAGE + '-' + str(pageIdx) + '-1.html'
        self.headers = {
            # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            # "Accept-Encoding": "gzip, deflate",
            # "Accept-Language": "zh-CN,zh;q=0.8",
            # "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            # "Host": "bbs.voc.com.cn"
        }

    # Total number of pages in this thread (20 posts per page).
    def getPages(self):
        req = urllib.request.Request(url=self.url, headers=self.headers)
        page = urllib.request.urlopen(req)
        data = page.read()
        # data = ungzip(data)
        # data = data.decode('utf-8')
        soup = BeautifulSoup(data, 'html5lib')
        # the <a class="p_total"> element holds the total reply count
        tag = soup.find('a', "p_total")
        if tag:
            pagesData = tag.get_text()
            print(pagesData)
            pagesNum = math.ceil(int(pagesData) / 20)
        else:
            pagesNum = 1
        return pagesNum

    # Point the scraper at page idx+1 of thread `page`.
    def setPage(self, idx, page):
        self.url = 'http://bbs.voc.com.cn/topic-' + page + '-' + str(idx + 1) + '-1.html'

    # Read one thread page and download every jpg/png/gif it links to.
    def readData(self, NAME="default"):
        req = urllib.request.Request(url=self.url, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = res.read()
        # saveFile(data)  # the raw page could also be written to disk
        for link, t in set(re.findall(r'(http:\S*?(jpg|png|gif))', str(data))):
            print(link)
            try:
                urllib.request.urlretrieve(link, saveImg(link, NAME))
            except:
                print('download failed')

hsli = ForumList()
listpagesNum = int(hsli.getListPages())

# Walk every listing page, then every thread on it, then every page of that thread.
for i in range(listpagesNum):
    req = urllib.request.Request('http://bbs.voc.com.cn/forum-50-' + str(i + 1) + '.html')
    page = urllib.request.urlopen(req)
    data = page.read()
    soup = BeautifulSoup(data, 'html5lib')
    tag = soup.findAll('a', "a1")   # <a class="a1"> links are the thread titles
    for link in tag:
        NAME = link.get_text()
        PAGE = link.get('href').split('-')[1]   # topic id out of "topic-<id>-..-1.html"
        cs = Hussheng()
        pagesNum = int(cs.getPages())
        print("Total pages in thread: ", pagesNum)
        for idx in range(pagesNum):
            cs.setPage(idx, PAGE)
            cs.readData(NAME)

This was adapted from the posts at http://blog.csdn.net/fly_yr/article/category/5847099; many thanks to that author for sharing.