Scraping Data with Python: 华声论坛 (Huasheng Forum)


I have been working at 博库 for a week now. The first two days were spent getting familiar with the WeChat Official Account platform that 博库 built in-house; the next three days went into rewriting the front-end gateway (前置机) code to run in proxy mode, so that library data is fetched through the proxy.

During that work I used the LIB_parse.php function library to pull page data, with curl simulating the requests.

Yesterday, while resting at home, I used LIB_parse.php to write some scraping rules. The photos on 华声论坛 looked pretty good, so I decided to use it as practice material.

<?php
set_time_limit(0);
include_once 'LIB_parse.php';

function get_curl_data($url)
{
    // Initialize the cURL session
    $curl = curl_init();
    // Set the URL to fetch
    curl_setopt($curl, CURLOPT_URL, $url);
    // Do not include the response headers in the returned data
    curl_setopt($curl, CURLOPT_HEADER, 0);
    // Return the response as a string instead of printing it directly
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    // Execute the request
    $data = curl_exec($curl);
    // Close the cURL session
    curl_close($curl);
    return $data;
}

// Fetch the thread list of board fid=50, ordered by views
$data = get_curl_data('http://bbs.voc.com.cn/forumdisplay.php?fid=50&orderby=views&page=1');
// Grab every <td class="tbfltd2"> cell, which holds the thread title links
$tds  = parse_array($data, '<td class="tbfltd2"', '</td>');
foreach ($tds as $k => $v) {
    $as[] = parse_array($v, '<a', '</a>');
}
$as = array_filter($as);
foreach ($as as $k => $v) {
    // Thread title plus the href of every link found in the cell
    $res[$k]['title'] = strip_tags(return_between($v[0], '<a', '</a>', INCL));
    foreach ($v as $k1 => $v1) {
        $res[$k]['href'][] = get_attribute($v1, 'href');
    }
}
// foreach ($res[182]['href'] as $k => $v) {
//     $detail = get_curl_data('http://bbs.voc.com.cn/' . $v);
//     $detail = parse_array($detail, '<div id="', '</div>');
//     foreach ($detail as $k1 => $v1) {
//         $res2[] = get_attribute($v1, 'src');
//     }
// }
die;
// Fetch every thread page and collect its <div id="..."> blocks
foreach ($res as $k => $v) {
    foreach ($v['href'] as $k1 => $v1) {
        $detail = get_curl_data('http://bbs.voc.com.cn/' . $v1);
        $res2[] = parse_array($detail, '<div id="', '</div>');
    }
}
var_dump($res2);die;
// Pull the src attribute out of every <div> block collected above
foreach ($res2 as $k => $divs) {
    foreach ($divs as $k1 => $div) {
        $res3[] = get_attribute($div, 'src');
    }
}
var_dump($res3);

This code cannot run properly: there are far too many loop iterations, each firing its own request, so the page just sits there waiting with the loading spinner going forever.
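The root cause is the pair of nested loops near the end: they issue one synchronous curl request per thread page, with no cap and no timeout. Below is a minimal sketch of the workaround in Python (the language the rest of this post switches to), assuming Python 3's urllib.request; the fetch helper, the thread_urls list, MAX_THREADS and the 10-second timeout are illustrative names and values, not part of the scripts in this post. The same idea in PHP would be a page cap plus CURLOPT_TIMEOUT on each request.

# Sketch: cap the number of requests per run and give each one a timeout,
# so the script fails fast instead of hanging forever.
import urllib.request

MAX_THREADS = 20   # illustrative cap on requests per run
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def fetch(url, timeout=10):
    # urlopen raises an error on timeout instead of blocking indefinitely
    req = urllib.request.Request(url=url, headers=HEADERS)
    with urllib.request.urlopen(req, timeout=timeout) as res:
        return res.read()

thread_urls = ['http://bbs.voc.com.cn/topic-7652393-1-1.html']  # hypothetical list
for url in thread_urls[:MAX_THREADS]:
    try:
        html = fetch(url)
    except Exception as exc:
        print('request failed:', exc)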

After getting up today I decided to try rewriting it in Python. It took a whole morning of looking things up and writing the extraction rules, but I ended up with a simple working scraper.

#!/usr/bin/python
# coding:utf-8
import urllib.request, socket, re, sys, os, math, gzip
from bs4 import BeautifulSoup

# url = 'http://bbs.voc.com.cn/topic-7652393-9-1.html'
#
# headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
#                         'Chrome/51.0.2704.63 Safari/537.36'}
# req = urllib.request.Request(url=url,headers=headers)
# res = urllib.request.urlopen(req)
# data = res.read()
# # Attach a parser
# soup = BeautifulSoup(data,'html5lib')
# print(soup.img)

# Print the src of every <img> tag in the document
# for link in soup.find_all('img'):
#     print(link.get('src'))

# Globals used to build the save path for each thread
NAME = ''
PAGE = ''

def saveImg(path, NAME):
    # Make sure the target directory exists
    targetPath = "G:\\python\\images\\" + NAME + ""
    if not os.path.isdir(targetPath):
        os.mkdir(targetPath)

    # Build the local path for this image from the last segment of its URL
    pos = path.rindex('/')
    t = os.path.join(targetPath, path[pos + 1:])
    return t


# Save the scraped text of one page to a file
def saveFile(data, i):
    path = "E:\\projects\\Spider\\06_csdn2\\papers\\paper_" + str(i + 1) + ".txt"
    file = open(path, 'wb')
    page = 'Current page: ' + str(i + 1) + '\n'
    file.write(page.encode('gbk'))
    # Write each record to the file (the script is saved as utf-8, output is encoded as gbk)
    for d in data:
        d = str(d) + '\n'
        file.write(d.encode('gbk'))
    file.close()

# Decompress gzip-encoded responses
def ungzip(data):
    try:
        # print("Decompressing...")
        data = gzip.decompress(data)
        # print("Done decompressing...")
    except:
        print("Not compressed, nothing to decompress...")
    return data


# Print all of the text content in the document
# print(soup.get_text())

class ForumList:
    def __init__(self, pageIdx=1, url=""):
        # Default to the first list page of the board
        self.pageIdx = pageIdx
        self.url = 'http://bbs.voc.com.cn/forum-50-1.html'
        self.headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        }

    # Work out the total number of list pages in the board
    def getListPages(self):
        req = urllib.request.Request(url=self.url, headers=self.headers)
        page = urllib.request.urlopen(req)

        # The original CSDN example gunzipped the response first; these forum
        # pages come back uncompressed, so that step stays commented out
        data = page.read()
        # data = ungzip(data)
        # data = data.decode('utf-8')

        # Build the BeautifulSoup object
        soup = BeautifulSoup(data, 'html5lib')
        # The <a class="p_total"> tag holds the total thread count
        tag = soup.find('a', "p_total")
        if tag:
            pagesData = tag.get_text()
            print(pagesData)
            # Alternative: pull the number out of text like "共20页" with a regex
            # pagesNum = re.findall(re.compile(pattern=r'共(.*?)页'), pagesData)[0]
            pagesNum = math.ceil(int(pagesData) / 80)  # 80 threads per list page
        else:
            pagesNum = 1
        return pagesNum


# os._exit(0)

class Hussheng:
    def __init__(self, pageIdx=1, url=""):
        # Default to the first page of the thread
        self.pageIdx = pageIdx
        # self.url = url[0:url.rfind('/') + 1] + str(pageIdx)
        self.url = 'http://bbs.voc.com.cn/topic-' + PAGE + '-' + str(pageIdx) + '-1.html'
        self.headers = {
            # "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            # "Accept-Encoding":"gzip, deflate",
            # "Accept-Language":"zh-CN,zh;q=0.8",
            # "Connection":"keep-alive",
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            # "Host":"bbs.voc.com.cn"
        }

    # Work out how many pages this thread has
    def getPages(self):
        req = urllib.request.Request(url=self.url, headers=self.headers)
        page = urllib.request.urlopen(req)

        # As above, decompression is not needed for these pages
        data = page.read()
        # data = ungzip(data)
        # data = data.decode('utf-8')

        # Build the BeautifulSoup object
        soup = BeautifulSoup(data, 'html5lib')
        # The <a class="p_total"> tag holds the total reply count
        tag = soup.find('a', "p_total")
        if tag:
            pagesData = tag.get_text()
            print(pagesData)
            # Alternative: pull the number out of text like "共20页" with a regex
            # pagesNum = re.findall(re.compile(pattern=r'共(.*?)页'), pagesData)[0]
            pagesNum = math.ceil(int(pagesData) / 20)  # 20 replies per thread page
        else:
            pagesNum = 1
        return pagesNum

    # Point the scraper at a specific page of the thread
    def setPage(self, idx, page):
        # saveImg('http://bbs.voc.com.cn/topic-7652393-'+str(idx+1)+'-1.html');
        # self.url = self.url[0:self.url.rfind('/') + 1] + str(idx)
        self.url = 'http://bbs.voc.com.cn/topic-' + page + '-' + str(idx + 1) + '-1.html'
        # return 'http://bbs.voc.com.cn/topic-7652393-'+str(idx+1)+'-1.html'

    # Fetch the current page and download every image it links to
    def readData(self, NAME="default"):
        ret = []
        req = urllib.request.Request(url=self.url, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = res.read()

        # The raw page could also be dumped to a file here
        # saveFile(data)

        # Pull every jpg/png/gif URL out of the page and download it
        for link, t in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', str(data))):
            print(link)
            try:
                urllib.request.urlretrieve(link, saveImg(link, NAME))
            except:
                print('download failed')

hsli = ForumList()
# Total number of list pages in the board
listpagesNum = int(hsli.getListPages())
# listpagesUrl = int(hsli.getPageUrl())
# print("Total list pages: ", listpagesNum)
# print(listpagesUrl);

for i in range(listpagesNum):
    req = urllib.request.Request('http://bbs.voc.com.cn/forum-50-' + str(i + 1) + '.html')
    page = urllib.request.urlopen(req)

    data = page.read()

    # Build the BeautifulSoup object for this list page
    soup = BeautifulSoup(data, 'html5lib')
    # Every thread title link has class "a1"
    tag = soup.find_all('a', "a1")

    for link in tag:
        NAME = link.get_text()
        # print(link.get('href'))
        PAGE = link.get('href').split('-')[1]

        cs = Hussheng()
        # Total number of pages in this thread
        pagesNum = int(cs.getPages())
        print("Total thread pages: ", pagesNum)

        for idx in range(pagesNum):
            cs.setPage(idx, PAGE)
            cs.readData(NAME)






    # print("当前页:", idx + 1)
    # 读取当前页的所有博文,结果为list类型
    # papers = cs.readData()
    # saveFile(papers, idx)
# print(imgurl)
# for link, t in set(re.findall(r'(http:[^s]*?(jpg|png|gif))', str(data))):
#     print(link)
#     try:
#         urllib.request.urlretrieve(link, saveImg(link))
#     except:
#         print('失败')

This script was adapted from the posts at http://blog.csdn.net/fly_yr/article/category/5847099; many thanks to that author for sharing their work.
