16. Web Scraping
Fetching the table of contents: request the listing page, pull the chapter titles and links out with regular expressions, and save them to an Excel file with openpyxl:
import requests
import re
import openpyxl

url = 'https://www.ddxs.cc/ddxs/661/'
book_html = requests.get(url)
book_html.encoding = 'gbk'  # the site serves GBK; set it explicitly to avoid garbled text
book_name = re.findall('<h1>(.*?)</h1>', book_html.text)
mulu = re.findall('.html">(.*?)</a>', book_html.text)[1:]                      # chapter titles
mulu_num = re.findall('<a href="/ddxs/661/(.*?).html">', book_html.text)[1:]   # chapter ids
mulu_url = []
for i in range(len(mulu)):
    mulu_url.append(f'{url}{mulu_num[i]}.html')
# Write the table of contents to Excel
mulu_excel = openpyxl.Workbook()
sheet = mulu_excel.active
sheet['A1'] = 'Title'
sheet['B1'] = 'URL'
number = 2
for i in range(len(mulu)):
    sheet.cell(row=number, column=1).value = mulu[i]
    sheet.cell(row=number, column=2).value = mulu_url[i]
    number += 1
mulu_excel.save('天道图书馆.xlsx')
# Read the table of contents back from Excel
book = openpyxl.load_workbook('天道图书馆.xlsx')
sheet = book.active
number = 2
for i in range(len(mulu)):
    title_cell = sheet.cell(row=number, column=1)   # use new names so the mulu list is not overwritten
    url_cell = sheet.cell(row=number, column=2)
    number += 1
    print(title_cell.value + ' ' + url_cell.value)
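The read-back loop above still leans on the in-memory mulu list for its length. As a small self-contained sketch (assuming the same two-column layout written earlier), openpyxl's iter_rows can walk the sheet directly until the first empty row:

import openpyxl

book = openpyxl.load_workbook('天道图书馆.xlsx')
sheet = book.active

# min_row=2 skips the header row; values_only=True yields plain values instead of Cell objects
for title, chapter_url in sheet.iter_rows(min_row=2, max_col=2, values_only=True):
    if title is None:   # stop at the first empty row
        break
    print(title, chapter_url)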
Fetching chapter content: read the saved Excel file, download each chapter page, and append its title and body to a text file:
import requests
import openpyxl
import re

book = openpyxl.load_workbook('天道图书馆.xlsx')
sheet = book.active

def download_chapter(url):      # renamed so the function no longer shadows the workbook variable `book`
    content = requests.get(url)
    content.encoding = 'gbk'    # same encoding as the listing page
    # re.S lets '.' match newlines, since the chapter body spans multiple lines
    content_url = re.findall('<div id="content">(.*?)</div>', content.text, re.S)
    content_body = content_url[0].replace(' ', '\t').replace('<br/>', '\n')
    title_url = re.findall('<h1>(.*?)</h1>', content.text)[0]
    content_title = f'\n\n{title_url}\n\n'
    with open(file='天道图书馆.txt', mode='a', encoding='GBK') as file1:
        file1.write(content_title)
        file1.write(content_body)
num = 2
count = 1
while True:
    mulu_name = sheet.cell(row=num, column=1).value
    mulu_url = sheet.cell(row=num, column=2).value
    if mulu_name is not None:
        num += 1
        print(mulu_name + mulu_url)
        download_chapter(mulu_url)
        print(f'Chapter {count} scraped.')
        count += 1
    else:
        print('All chapters downloaded.')
        break
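Neither loop sets request headers or a timeout, and chapters are fetched back-to-back. A minimal hardening sketch that the downloads could be routed through; the User-Agent value and the one-second pause are illustrative assumptions, not something the original notes specify:

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}   # illustrative value, adjust for the target site

def fetch(url):
    # timeout keeps the loop from hanging on a slow response
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.encoding = 'gbk'
    resp.raise_for_status()   # surface HTTP errors instead of parsing an error page
    time.sleep(1)             # pause between requests to go easy on the site
    return resp.text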