16.爬虫技术
大约 1 分钟 · 学习笔记 · Python基础
获取目录:
import requests
import re
import openpyxl
# Fetch the novel's table-of-contents page and extract chapter titles and
# chapter page URLs from it.
url = 'https://www.ddxs.cc/ddxs/661/'
# A timeout keeps the script from hanging forever on a stalled connection.
book_html = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of regex-parsing an error page.
book_html.raise_for_status()
book_html.encoding = 'gbk'      # force GBK decoding to fix garbled text
book_name = re.findall('<h1>(.*?)</h1>', book_html.text)
# Both lists skip their first match ([1:]) -- presumably a non-chapter link;
# confirm against the page markup.
mulu = re.findall('.html">(.*?)</a>', book_html.text)[1:]
mulu_num = re.findall('<a href="/ddxs/661/(.*?).html">', book_html.text)[1:]
# The titles and ids come from two independent patterns and are paired by
# position, so a length mismatch means the pairing cannot be trusted.
if len(mulu) != len(mulu_num):
    raise ValueError(
        f'chapter titles ({len(mulu)}) and chapter ids ({len(mulu_num)}) '
        f'do not line up for {url}'
    )
mulu_url = [f'{url}{chapter_id}.html' for chapter_id in mulu_num]
# Write the chapter titles and URLs to an Excel workbook.
workbook = openpyxl.Workbook()
sheet = workbook.active
# Header row.
sheet['A1'] = '标题'
sheet['B1'] = '目录'
# Data rows start at row 2, directly below the header.
for row_index, (title, link) in enumerate(zip(mulu, mulu_url), start=2):
    sheet.cell(row=row_index, column=1).value = title
    sheet.cell(row=row_index, column=2).value = link
workbook.save('天道图书馆.xlsx')
# Read the Excel file back and print every title/URL pair.
book = openpyxl.load_workbook('天道图书馆.xlsx')
sheet = book.active
# Iterate the sheet's own rows (skipping the header in row 1) instead of
# counting with len(mulu); this also avoids rebinding the `mulu` and
# `mulu_url` lists to cell objects inside the loop.
for title, link in sheet.iter_rows(min_row=2, max_col=2, values_only=True):
    if title is None:
        # An empty title cell marks the end of the data.
        break
    print(f'{title}  {link}')

# --- Fetch chapter content ---
import requests
import openpyxl
import re
# Open the chapter list saved as an Excel workbook. The workbook is not named
# `book` because the function `book` defined right below would overwrite it.
workbook = openpyxl.load_workbook('天道图书馆.xlsx')
sheet = workbook.active
def book(url):
    """Download one chapter page and append its title and text to the novel file.

    The chapter title is taken from the page's ``<h1>`` element and the text
    from its ``<div id="content">`` element; both are appended to
    ``天道图书馆.txt``.

    Args:
        url: Address of the chapter page to download.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``<h1>`` or ``<div id="content">``.
    """
    # A timeout keeps one stalled request from hanging the whole crawl.
    content = requests.get(url, timeout=10)
    content.raise_for_status()
    # Decode as GBK, the same way this file decodes the site's
    # table-of-contents page; otherwise the text can come out garbled and
    # the GBK file write below can fail.
    content.encoding = 'gbk'
    # re.S lets '.' span newlines so a multi-line chapter body still matches.
    content_url = re.findall('<div id="content">(.*?)</div>', content.text, re.S)
    title_matches = re.findall('<h1>(.*?)</h1>', content.text)
    if not content_url or not title_matches:
        raise ValueError(f'unexpected page layout, nothing extracted from {url}')
    # Turn the HTML indentation and line breaks into plain-text equivalents.
    content_body = content_url[0].replace('    ', '\t').replace('<br/>', '\n')
    content_title = f'\n\n{title_matches[0]}\n\n'
    # Append mode: every chapter is added to the end of the same file.
    with open(file='天道图书馆.txt', mode='a', encoding='GBK') as file1:
        file1.write(content_title)
        file1.write(content_body)
# Walk the chapter list (row 1 is the header) and download each chapter.
# The counter starts at 0 and is incremented after each download, so the
# first chapter is reported as chapter 1.
count = 0
for mulu_name, mulu_url in sheet.iter_rows(min_row=2, max_col=2, values_only=True):
    if mulu_name is None:
        # An empty title cell marks the end of the chapter list.
        break
    print(mulu_name+mulu_url)
    book(mulu_url)
    count += 1
    print(f'第{count}章,爬取完成~~~')
print('读取完成!')