生活随笔
收集整理的這篇文章主要介紹了
python使用requests+xpath爬取小说并下载
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
這個爬蟲只是選定熱門小說,不支持自選搜索下載,日后會補充并改進。
選定小說網(wǎng)址:
爬取:
import requests
from lxml
import etree
import os
import re
import time
import datetime
def main_html(url, headers):
    """Fetch the site front page and return the category navigation hrefs.

    Args:
        url: front-page URL of the novel site.
        headers: HTTP headers dict (User-Agent) passed to requests.

    Returns:
        List of href strings from the nav bar, trimmed at both ends.
    """
    nav_href_xpath = "//*[@id='wrapper']/div[2]/ul/li/a//@href"
    page_text = requests.get(url, headers=headers).text
    tree = etree.HTML(page_text)
    # Slice [2:-1] mirrors the original: the first two and the last nav
    # links are presumably non-category entries — confirm against the site.
    return tree.xpath(nav_href_xpath)[2:-1]
def next_html(url, headers):
    """List the books of a category page and let the user pick one by title.

    Prints the scraped book titles, prompts for an exact title, and
    returns a two-element list [book_name, book_url] on success, or
    False when the typed title is not among the scraped books.

    Args:
        url: category page URL.
        headers: HTTP headers dict passed to requests.
    """
    xpath_book = '//*[@id="newscontent"]/div[2]/ul/li/span/a/text()'
    xpath_url = '//*[@id="newscontent"]/div[2]/ul/li/span/a/@href'
    rep = requests.get(url=url, headers=headers).text
    html = etree.HTML(rep)
    r_book = html.xpath(xpath_book)
    r_url = html.xpath(xpath_url)
    book_to_url = dict(zip(r_book, r_url))
    print(r_book)
    s_book = input("============請輸入你想要看的小說全稱============" + "\n")
    try:
        # Was: a bare `except:` wrapping everything, whose handler built
        # (and discarded) a KeyError instance. Narrow the try to the one
        # operation that can legitimately fail: the title lookup.
        s_url = book_to_url[s_book]
    except KeyError:
        print("輸入有誤")
        return False
    return [s_book, s_url]
def story_html(url, headers):
    """Scrape a book's chapter index page.

    Args:
        url: the book's index page URL.
        headers: HTTP headers dict passed to requests.

    Returns:
        Dict mapping chapter title text -> chapter href.
    """
    title_xpath = '//*[@id="list"]/dl/dd//text()'
    href_xpath = '//*[@id="list"]/dl/dd//a/@href'
    page = requests.get(url=url, headers=headers).text
    doc = etree.HTML(page)
    titles = doc.xpath(title_xpath)
    hrefs = doc.xpath(href_xpath)
    # NOTE(review): slice [9:-1] appears to skip boilerplate entries at the
    # top/bottom of the list (e.g. a "latest chapters" block) — confirm
    # against the site's markup.
    return dict(zip(titles[9:-1], hrefs[9:-1]))
def download_html(bookname, url_name_dict):
    """Download every chapter in url_name_dict into ./<bookname>/.

    Chapter titles matching "第...章" are saved as numbered UTF-8 .txt
    files with a line break inserted every 50 characters; other entries
    are reported as invalid and excluded from the progress total.

    Args:
        bookname: book title, used as the output directory name.
        url_name_dict: dict mapping chapter title -> chapter URL.

    NOTE(review): relies on the module-level `headers` dict defined in
    the __main__ section.
    """
    chapter_pat = re.compile(r"[\u7b2c](.|\n)*[\u7ae0]")  # "第...章"
    han_pat = re.compile(u'[\u4e00-\u9fa5].+?')
    # os.path.join keeps the path portable (the original hard-coded "\\"),
    # and the directory is created once up front instead of being
    # re-checked on every chapter.
    book_dir = os.path.join(os.getcwd(), bookname)
    os.makedirs(book_dir, exist_ok=True)
    num_d = 0
    key_count = len(url_name_dict)
    for key, chapter_url in url_name_dict.items():
        time.sleep(0.5)  # throttle requests
        rep = requests.get(url=chapter_url, headers=headers).text
        html = etree.HTML(rep)
        fragments = html.xpath('//*[@id="content"]//text()')
        # As in the original: stringify the fragment list and keep the
        # short runs that start with a CJK character.
        text = "".join(han_pat.findall(str(fragments)))
        if chapter_pat.match(key):
            num_d += 1
            try:
                out_path = os.path.join(book_dir, str(num_d) + key + ".txt")
                with open(out_path, "w+", encoding="utf-8") as fp:
                    for i, ch in enumerate(text):
                        # Insert a newline before every 50th character.
                        if i and i % 50 == 0:
                            fp.write("\n")
                        fp.write(ch)
                print("{:10}{:>15}{:>20}".format(
                    key, "下載成功",
                    "已完成:" + str(num_d) + "/" + str(key_count)))
            except (TimeoutError, IndexError):
                pass
        else:
            # Non-chapter entry: shrink the total so progress stays accurate.
            key_count -= 1
            print("{:10}{:>15}".format(key, "無效章節"))
- 寫一個判斷函數;只能獲取固定的小說分類;
def is_num(num):
    """Return True if `num` is a menu choice in 1..7 that int() accepts.

    The original mixed float() and unicodedata.numeric() checks, which
    could return True for characters such as '½' or Roman numerals —
    inputs that then crash the caller's `int(num)` conversion. Only
    values that int() itself parses, in the menu range 1..7, pass now.

    Args:
        num: the raw string typed at the menu prompt.
    """
    try:
        choice = int(num)
    except (TypeError, ValueError):
        return False
    return 1 <= choice <= 7
def main_menu():
    """Show the category menu and return the user's raw choice string.

    The prompt string carried scraping mojibake ("數(shù)字") which is
    repaired here to plain "數字".
    """
    return input("""========================按數字輸入想看的小說類型========================1.玄幻小說 2.修真小說 3.都市小說 4.穿越小說 5.網游小說 6.科幻小說 7.言情小說""")
if __name__ == '__main__':
    start_time = datetime.datetime.now()
    main_url = "http://www.b520.cc/"
    # `headers` stays module-level: download_html() reads it as a global.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}
    num = main_menu()
    main_text_url = main_html(url=main_url, headers=headers)
    if is_num(num):
        # Menu is 1-based; the nav-link list is 0-based.
        index = int(num) - 1
        text_2_url = main_text_url[index]
        # NOTE(review): [2:-1] presumably strips a leading "//" and a
        # trailing "/" from the scheme-relative href — confirm.
        text_2_url = "http://" + text_2_url[2:-1] + "/"
        next_url = next_html(text_2_url, headers=headers)
        # next_html returns False for an unknown title; the original then
        # crashed with a TypeError on False[1]. Skip downloading instead.
        if next_url:
            next_3 = story_html(next_url[1], headers=headers)
            download_html(next_url[0], next_3)
    end_time = datetime.datetime.now()
    print("############################共耗時%ss############################" % (end_time - start_time))
這個(gè)程序可以直接復(fù)制運(yùn)行。
總結(jié)
以上是生活随笔為你收集整理的python使用requests+xpath爬取小说并下载的全部內(nèi)容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。