Web Scraping: Storing Data to Files and Databases (Zhihu and Tieba)
Published: 2021-05-20 22:24:09


  Note: every cookie in the code below must be replaced with your own cookie.

1. Exporting data to a text file

1.1 Test code

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time

headers = {
    'cookie': '',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}   # replace with your own cookie


def get_page(url):
    try:
        html = requests.get(url, headers=headers, timeout=5)
        if html.status_code == 200:
            print('Request succeeded')
            return html.text
        else:   # this else branch is not strictly necessary
            return None
    except RequestException:
        print('Request failed')


def parse_page(html):
    html = BeautifulSoup(html, "html.parser")
    titles = html.find_all("h2", {'class': 'HotItem-title'})[:10]
    links = html.find_all('div', {"class": "HotItem-content"})[:10]
    hot_values = html.find_all('div', {"class": "HotItem-content"})[:10]
    texts = html.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, links, hot_values, texts


def store_data(titles, links, hot_values, texts):
    # utf-8 so the Chinese text is written reliably regardless of the system default encoding
    with open('热榜测试.txt', 'a', encoding='utf-8') as f:
        f.write('+' * 80 + '\n')
        f.write(time.asctime().center(80) + '\n')
        f.write('+' * 80 + '\n' * 2)
        index = 1
        for title, link, hot_value, text in zip(titles, links, hot_values, texts):
            print(title.get_text(), '\n', link.a.attrs['href'], '\n', hot_value.div.get_text().replace('\u200b', ''))
            f.write(str(index) + ': ' + title.get_text() + '\n')
            f.write('   ' + link.a.attrs['href'] + '\n')
            f.write('   ' + hot_value.div.get_text().replace('\u200b', ' ') + '\n')  # '\u200b' is an invisible zero-width character
            if text.p is None:
                f.write('   ' + 'None: the page shows no excerpt for this entry' + '\n')
            else:
                f.write('   ' + text.p.get_text() + '\n')
            f.write('\n')
            index += 1


if __name__ == '__main__':
    input_url = 'https://www.zhihu.com/hot'
    a, b, c, d = parse_page(get_page(input_url))
    store_data(a, b, c, d)

1.2 Result screenshot

2. Exporting data to Excel

2.1 Test code example

# coding: utf-8
import requests
from bs4 import BeautifulSoup
import openpyxl
from requests.exceptions import RequestException
import re
import datetime


def get_page(url):
    try:
        html = requests.get(url)
        if html.status_code == 200:
            # print(html.text)
            return html.text
        else:
            return None
    except RequestException:
        print('Request failed')


def parse_page(html):
    html = BeautifulSoup(html, 'html.parser')
    topic_items = html.find_all('div', {"class": "topic-name"})
    topic_values = html.find_all('span', {"class": "topic-num"})
    topic_statements = html.find_all('div', {"class": "topic-info"})
    topic_imgs = html.find_all('li', {"class": "topic-top-item"})
    return topic_items, topic_values, topic_statements, topic_imgs


def store_data(topic_items, topic_values, topic_statements, topic_imgs):
    regex = re.compile(r'\d+(\.\d+)?')   # extracts the numeric part of the discussion count
    wb = openpyxl.load_workbook('贴吧热榜Excel.xlsx')   # the workbook must already exist
    sheet = wb['Sheet1']
    sheet.freeze_panes = 'A2'   # keep the header row visible when scrolling
    for item, value, statement, img in zip(topic_items, topic_values, topic_statements, topic_imgs):
        print(item.a.get_text(), '\n', item.a['href'], '\n', float(regex.search(value.get_text()).group()), '万')
        sheet.append([item.a.get_text(), item.a['href'], float(regex.search(value.get_text()).group()),
                      statement.p.get_text(), img.img['src'], datetime.datetime.now()])
    wb.save('贴吧热榜Excel.xlsx')


def main(url):
    html = get_page(url)
    topic_items, topic_values, topic_statements, topic_imgs = parse_page(html)
    store_data(topic_items, topic_values, topic_statements, topic_imgs)


if __name__ == '__main__':
    input_url = 'http://tieba.baidu.com/hottopic/browse/topicList?res_type=1&red_tag=i0626384809'
    main(input_url)
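Note that openpyxl.load_workbook raises FileNotFoundError unless 贴吧热榜Excel.xlsx already exists and contains a sheet named Sheet1. A minimal one-off setup sketch; the column labels below are my own placeholders, not part of the original script:

# One-off setup for the workbook the script above expects.
import openpyxl

wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'Sheet1'          # the scraping script looks up wb['Sheet1']
# Header row; freeze_panes = 'A2' in the script keeps this row visible.
# These column labels are only placeholders -- use whatever you prefer.
sheet.append(['topic', 'link', 'discussions_wan', 'description', 'image_url', 'crawled_at'])
wb.save('贴吧热榜Excel.xlsx')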

 

2.2 Result screenshot

 

 

3. Writing data to MySQL

3.1 Creating the database

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306)  # replace with your own password
cursor = db.cursor()
sql = "CREATE DATABASE IF NOT EXISTS test_db"
cursor.execute(sql)
db.close()

3.1.1 Result screenshot

  Open Navicat and you can see the database that was just created.
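If you would rather not open Navicat, the same check can be done from Python. A minimal sketch, assuming the same connection settings as above (fill in your own password):

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306)  # replace with your own password
cursor = db.cursor()
cursor.execute("SHOW DATABASES")
print([row[0] for row in cursor.fetchall()])   # 'test_db' should appear in this list
db.close()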

 

 

3.2 Creating the table

  The MySQL statement below has the database fill in each record's creation time and update time automatically as rows are added and modified. The usual choice of primary key is id, but here I try using title as the primary key instead (so inserting the same title twice will fail with a duplicate-key error).

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='test_db')  # replace with your own password
cursor = db.cursor()
sql = """
        CREATE TABLE IF NOT EXISTS hot_lists (title VARCHAR(255) NOT NULL, link VARCHAR(255)
        NOT NULL, value_thousand float(6,1) NOT NULL, content VARCHAR(10000) NOT NULL, create_time timestamp
        NOT NULL DEFAULT CURRENT_TIMESTAMP, update_time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
        PRIMARY KEY (title))
      """
cursor.execute(sql)
db.close()
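To confirm that create_time and update_time really received their DEFAULT / ON UPDATE attributes, you can also inspect the table definition from Python; a minimal sketch under the same connection assumptions:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='test_db')  # replace with your own password
cursor = db.cursor()
cursor.execute("DESC hot_lists")
for column in cursor.fetchall():
    print(column)   # (name, type, null, key, default, extra) for each column
db.close()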

3.2.1 Result screenshot

 

 

3.3 Full code test

import requests
from bs4 import BeautifulSoup
import pymysql
from requests.exceptions import RequestException


def get_page(url):
    headers = {
        'cookie': '',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }   # replace with your own cookie
    try:
        html = requests.get(url, headers=headers, timeout=5)
        return html.text
    except RequestException:
        print('Request failed')


def parse_page(html):
    html = BeautifulSoup(html, "html.parser")
    titles = html.find_all("h2", {'class': 'HotItem-title'})[:10]
    links = html.find_all('div', {"class": "HotItem-content"})[:10]
    hot_values = html.find_all('div', {"class": "HotItem-content"})[:10]
    texts = html.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, links, hot_values, texts


def store_data(titles, links, hot_values, texts):
    con = pymysql.connect(host='localhost', user='root', password='120888', port=3306, db='test_db')  # replace with your own password
    cur = con.cursor()
    sql = 'INSERT INTO hot_lists (title, link, value_thousand, content) VALUES (%s, %s, %s, %s)'
    for title, link, hot_value, text in zip(titles, links, hot_values, texts):
        try:
            # the hot value is shown in units of 万 (10,000); *10 converts it to thousands
            value = float(hot_value.div.get_text().replace('\u200b', ' ').split()[0]) * 10
            content = 'None' if text.p is None else text.p.get_text()
            cur.execute(sql, (title.get_text(), link.a.attrs['href'], value, content))
            con.commit()
        except Exception:
            print('Failed')   # e.g. a duplicate title violates the primary key
            con.rollback()
    con.close()


def main():
    url = 'https://www.zhihu.com/hot'
    html = get_page(url)
    titles, links, hot_values, texts = parse_page(html)
    store_data(titles, links, hot_values, texts)


if __name__ == '__main__':
    main()
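After a run, the rows can be read back to confirm the inserts worked; a minimal sketch, again assuming the connection settings used above:

import pymysql

con = pymysql.connect(host='localhost', user='root', password='120888', port=3306, db='test_db')  # replace with your own password
cur = con.cursor()
cur.execute("SELECT title, value_thousand, create_time, update_time FROM hot_lists ORDER BY value_thousand DESC")
for row in cur.fetchall():
    print(row)
con.close()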

3.3.1 Result screenshot

 

 

4. Writing data to MongoDB

  MongoDB is simpler to work with than MySQL: there is no need to create the database or the collection in advance.

4.1 Test code

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from pymongo import MongoClient

client = MongoClient('localhost')
db = client['test_db']


def get_page(url):
    headers = {
        'cookie': '',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }   # replace with your own cookie
    try:
        html = requests.get(url, headers=headers, timeout=5)
        return html.text
    except RequestException:
        print('Request failed')


def parse_page(html):
    html = BeautifulSoup(html, "html.parser")
    titles = html.find_all("h2", {'class': 'HotItem-title'})[:10]
    links = html.find_all('div', {"class": "HotItem-content"})[:10]
    hot_values = html.find_all('div', {"class": "HotItem-content"})[:10]
    texts = html.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, links, hot_values, texts


def store_data(titles, links, hot_values, texts):
    for title, link, hot_value, text in zip(titles, links, hot_values, texts):
        try:
            # the hot value is shown in units of 万 (10,000); *10 converts it to thousands
            value = float(hot_value.div.get_text().replace('\u200b', ' ').split()[0]) * 10
            content = 'None' if text.p is None else text.p.get_text()
            # insert_one replaces the insert() method deprecated in older pymongo versions
            db['hot_lists'].insert_one({"title": title.get_text(), "link": link.a.attrs['href'],
                                        "value_thousand": value, "content": content})
        except Exception:
            print('Failed')


def main():
    url = 'https://www.zhihu.com/hot'
    html = get_page(url)
    titles, links, hot_values, texts = parse_page(html)
    store_data(titles, links, hot_values, texts)


if __name__ == '__main__':
    main()
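A minimal read-back sketch for the MongoDB version, assuming the same MongoClient settings as above:

from pymongo import MongoClient

client = MongoClient('localhost')
db = client['test_db']
# Sort by hot value, highest first; MongoDB adds the _id field automatically.
for doc in db['hot_lists'].find().sort('value_thousand', -1):
    print(doc['title'], doc['value_thousand'])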

 

4.2 Result screenshot

 
