1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
| from bs4 import BeautifulSoup import re import urllib.request, urllib.error import xlwt
def main():
    """Collect the movie list via ``getData`` and hand it to ``saveData``.

    The listing URL prefix and the output workbook name are fixed here.
    """
    listing_url = "https://movie.douban.com/top250?start="
    output_file = "豆瓣电影Top250.xls"
    saveData(getData(listing_url), output_file)
def getSinglePage(url):
    """Download one page and return its body decoded as UTF-8.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        with ``urllib.error.URLError``. In that case the error's ``code``
        and/or ``reason`` attributes (whichever exist) are printed.
    """
    head = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.124 Safari/537.36"
        )
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even when read() or
        # decode() raises; previously the response was never closed.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError (a URLError subclass) carries a status code; a plain
        # URLError only carries a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Patterns applied to the serialized HTML of a single <div class="item"> entry.
_REG_LINK = re.compile(r'<a href="(.*?)">')
_REG_IMG_SRC = re.compile(r'<img.*src="(.*?)"', re.S)
_REG_TITLE = re.compile(r'<span class="title">(.*?)</span>')
_REG_RATING = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
_REG_COMMENT_NUM = re.compile(r'<span>(\d*)人评价</span>')
_REG_INQ = re.compile(r'<span class="inq">(.*?)</span>')
_REG_PEOPLE = re.compile(r'<p class="">(.*?)</p>', re.S)
_REG_BR = re.compile(r'<br(\s+)?/>(\s+)?')

# Placeholder stored for any field that cannot be found in an entry.
_MISSING = " "


def _first_match(pattern, text):
    """Return the first captured group of ``pattern`` in ``text``, or the placeholder."""
    match = pattern.search(text)
    return match.group(1) if match else _MISSING


def _parse_item(item_html):
    """Extract the eight output fields from one serialized list entry.

    Returns a list ordered as: rating, first title, second title,
    rating count, people/summary paragraph, quote, detail link, image link.
    """
    titles = _REG_TITLE.findall(item_html)
    first_title = titles[0] if titles else _MISSING
    if len(titles) == 2:
        # Drop the "/" separator and whatever padding surrounded it.
        second_title = titles[1].replace("/", "").strip()
    else:
        second_title = _MISSING

    people_match = _REG_PEOPLE.search(item_html)
    if people_match:
        # Collapse <br/> tags (and trailing whitespace) into single spaces.
        people = _REG_BR.sub(" ", people_match.group(1)).strip()
    else:
        people = _MISSING

    return [
        _first_match(_REG_RATING, item_html),
        first_title,
        second_title,
        _first_match(_REG_COMMENT_NUM, item_html),
        people,
        _first_match(_REG_INQ, item_html),
        _first_match(_REG_LINK, item_html),
        _first_match(_REG_IMG_SRC, item_html),
    ]


def getData(baseurl, pages=10, page_size=25):
    """Fetch the listing pages and parse every ``<div class="item">`` entry.

    Args:
        baseurl: URL prefix; the start offset of each page is appended to it.
        pages: Number of listing pages to request (default 10).
        page_size: Offset step between consecutive pages (default 25).

    Returns:
        A list with one 8-element list per parsed entry (see ``_parse_item``
        for the field order). A field that is absent from an entry is stored
        as ``" "`` instead of raising ``IndexError``, matching how a missing
        quote was already handled. Pages that ``getSinglePage`` could not
        fetch contribute no entries.
    """
    datalist = []
    for page in range(pages):
        url = baseurl + str(page * page_size)
        html = getSinglePage(url)
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            datalist.append(_parse_item(str(item)))
    # Kept from the original: dump everything that was collected to stdout.
    print(datalist)
    return datalist
def saveData(datalist, savepath):
    """Write the scraped rows to an .xls workbook at ``savepath``.

    Row 0 holds the eight column headers; each entry of ``datalist`` is
    written to the following rows, one value per column. A progress line is
    printed for every row.

    Args:
        datalist: Sequence of rows, each a sequence of cell values. Only the
            first eight values of a row are written.
        savepath: Destination path passed to ``workbook.save``.
    """
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('第一页', cell_overwrite_ok=True)
    col = ("评分", "影片中文名", "影片外国名", "评价数", "概况", "引言", "影片详情链接", "影片海报链接")
    for j, header in enumerate(col):
        worksheet.write(0, j, header)
    # Iterate over the rows actually present rather than a fixed 250: a
    # shorter list (e.g. after a failed page fetch) used to raise IndexError
    # before the workbook was saved, losing everything.
    for i, rowData in enumerate(datalist):
        print("第%d条" % i)
        for j, value in enumerate(rowData[:len(col)]):
            worksheet.write(i + 1, j, value)
    workbook.save(savepath)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
|