Spider
准备工作
安装库
终端打开 Python 安装目录下的 Scripts 目录,执行 '.\pip install 包名' 安装库
常用库: bs4 re urllib xlwt sqlite3
引用库
1 2 3 4 5 6 7 8 9 10 11 12 13
|
import bs4 import re import urllib.request, urllib.error import xlwt import sqlite3
def aom():
    """Print a separator line of ten asterisks."""
    print("*" * 10)


if __name__ == "__main__":
    aom()
|
构建流程
baseurl = "https://movie.douban.com/top250?start=0&filter="
豆瓣电影 Top 250
savepath = "E:/py/douban.xls"
爬取网页
1 2 3 4 5 6
def GetData(baseurl):
    """Crawl the pages starting at *baseurl* and return the parsed rows (stub).

    Placeholder for the crawling stage; the real implementation appears later.
    """
    rows = []  # renamed from ``list`` — that name shadows the builtin
    return rows


datalist = GetData(baseurl)
|
解析数据
保存数据
获取数据
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
| def AskUrl(url): head = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66" } request = urllib.request.Request(url, headers=head) html = "" try: response = urllib.request.urlopen(request) html = response.read().decode("utf-8") print(html) except urllib.error.URLError as e: if hasattr(e, "code"): print(e.code) if hasattr(e, "reason"): print(e.reason) return html
|
解析数据
补充bs4
bs4将Html文档转换成一个复杂的树形结构,每个节点都是python对象,所有对象可以归纳为4种
-Tag
1 2 3 4 5 6 7 8 9 10 11
from bs4 import BeautifulSoup

# ``with`` guarantees the file handle is closed (the original leaked it).
with open("E:\\baidu.html", "rb") as f:
    html = f.read()
bs = BeautifulSoup(html, "html.parser")

# Attribute access returns the FIRST tag with that name in the document.
print(bs.title)
print(bs.a)
print(bs.link)
print(type(bs.link))
|
会拿到第一个标签
-NavigableString
1 2 3 4 5 6 7 8
from bs4 import BeautifulSoup

# ``with`` guarantees the file handle is closed (the original leaked it).
with open("E:\\baidu.html", "rb") as f:
    html = f.read()
bs = BeautifulSoup(html, "html.parser")

# ``.string`` is the tag's inner text (a NavigableString / Comment object).
print(bs.title.string)
print(type(bs.title.string))
|
上面的会打印注释
下面会打印出字典
1 2 3 4 5 6 7 8
from bs4 import BeautifulSoup

# ``with`` guarantees the file handle is closed (the original leaked it).
with open("E:\\baidu.html", "rb") as f:
    html = f.read()
bs = BeautifulSoup(html, "html.parser")

# ``.attrs`` is a plain dict of the first <a> tag's attributes.
print(bs.a.attrs)
print(type(bs.a.attrs))
|
-BeautifulSoup
1 2 3 4 5 6 7
from bs4 import BeautifulSoup

# ``with`` guarantees the file handle is closed (the original leaked it).
with open("E:\\baidu.html", "rb") as f:
    html = f.read()
bs = BeautifulSoup(html, "html.parser")

# Printing the BeautifulSoup object dumps the whole parsed document.
print(bs)
|
-comment
第一种打印出注释就是comment
节点的获取
文档的遍历
1 2 3 4 5 6 7
from bs4 import BeautifulSoup

# ``with`` guarantees the file handle is closed (the original leaked it).
with open("E:\\baidu.html", "rb") as f:
    html = f.read()
bs = BeautifulSoup(html, "html.parser")

# ``contents`` is a list of the tag's direct children, indexable by position.
print(bs.head.contents[0])
|
contents是个列表,可以通过下标访问元素
文档的搜索
find_all—字符串过滤、会查找与字符串完全一致的标签
1 2 3 4 5 6 7 8
from bs4 import BeautifulSoup

# ``with`` guarantees the file handle is closed (the original leaked it).
with open("E:\\baidu.html", "rb") as f:
    html = f.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")

# A plain string matches tag names EXACTLY equal to it.
t_list = bs.find_all("a")
print(t_list)
|
正则表达式搜索:把编译好的正则对象传入 find_all()(内部用 search() 匹配标签名),会把标签名里包含 a 的标签都拿出来
1 2 3 4 5 6 7 8 9
from bs4 import BeautifulSoup
import re

# ``with`` guarantees the file handle is closed (the original leaked it).
with open("E:\\baidu.html", "rb") as f:
    html = f.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")

# A compiled regex matches any tag whose NAME the pattern finds a match in.
t_list = bs.find_all(re.compile("a"))
print(t_list)
|
方法:传入一个函数,根据函数的要求来搜索
1 2 3 4 5 6 7 8 9 10 11 12 13
from bs4 import BeautifulSoup
import re

# ``with`` guarantees the file handle is closed (the original leaked it).
with open("E:\\baidu.html", "rb") as f:
    html = f.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")


def is_name_exist(tag):
    """Keep only tags that carry a ``name`` attribute."""
    # Bug fix: bs4 Tag objects provide ``has_attr``; the original called a
    # non-existent ``tag.hasattr(...)``.
    return tag.has_attr("name")


t_list = bs.find_all(is_name_exist)
print(t_list)
|
kwargs(关键字)参数
1 2 3 4 5 6 7 8
from bs4 import BeautifulSoup
import re

# ``with`` guarantees the file handle is closed (the original leaked it).
with open("E:\\baidu.html", "rb") as f:
    html = f.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")

# Keyword arguments filter on tag ATTRIBUTES, here id="head".
t_list = bs.find_all(id="head")
|
补充正则表达式
史上最全常用正则表达式大全
常用操作符
Re库的使用
Re库的修饰符
1 2 3 4
| import re pat = re.compile("AA") m = pat.search("ABCFFFAA") print(m)
|
上面的search只会找到第一个匹配的串,下面是第二种写法,前面是正则表达式
1 2
| import re print(re.search("aa", "fsaa"))
|
下面是findall
1 2
| import re print(re.findall("aa", "faasaa"))
|
1 2
| import re print(re.findall("[A-Z]", "faCasBaaAA"))
|
sub的使用,在第三个字符串中,用第二个字符替换掉第一个字符
1 2
| import re print(re.sub("a", "A", "asdfagfsg"))
|
建议在正则表达式前面加上r,不用担心转义字符的问题
1 2
| import re print(re.sub("a", r"/A/", "asdfagfsg"))
|
正则提取
1 2 3 4 5 6 7
| findlink = re.compile(r'<a href="(.*?)">') findImgsrc = re.compile(r'<img.*src="(.*?)"', re.S) findMovieName = re.compile(r'<span class="title">(.*)</span>') findScore = re.compile( r'<span class="rating_num" property="v:average">(.*)</span>') findIformation = re.compile(r'<span class="inq">(.*)</span>') findDitail = re.compile(r'<p class="">(.*)</p>', re.S)
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
| def GetData(baseurl): datalist = [] for i in range(0, 1): url = baseurl + str(i) html = AskUrl(url) soup = bs4.BeautifulSoup(html, "html.parser") for item in soup.find_all('div', class_="item"): data = [] item = str(item) link = re.findall(findlink, item)[0] img = re.findall(findImgsrc, item)[0] name = re.findall(findMovieName, item)[0] score = re.findall(findScore, item)[0] print(score) break return datalist
|
标签解析
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
def GetData(baseurl):
    """Parse the first Top-250 page into rows.

    Each row is [link, img_src, chinese_title, other_title, score, quote].
    """
    datalist = []
    for i in range(0, 1):
        url = baseurl + str(i)
        html = AskUrl(url)
        soup = bs4.BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            link = re.findall(findlink, item)[0]
            data.append(link)
            img = re.findall(findImgsrc, item)[0]
            data.append(img)
            name = re.findall(findMovieName, item)
            if (len(name) == 2):
                ctitle = name[0]
                data.append(ctitle)
                otitle = name[1].replace("\xa0/\xa0", "")
                data.append(otitle)
            else:
                data.append(name[0])
                data.append(" ")  # placeholder when there is no foreign title
            score = re.findall(findScore, item)[0]
            data.append(score)
            information = re.findall(findIformation, item)[0]
            data.append(information.replace("。", ""))
            datalist.append(data)
    print(datalist)
    return datalist
|
保存
保存到excel中
1 2 3 4 5
| import xlwt wookbook = xlwt.Workbook() wooksheet = wookbook.add_sheet("sheet1") wooksheet.write(0, 0, "Hello") wookbook.save("text.xls")
|
下面是九九乘法表,示例python的xlwt的使用
1 2 3 4 5 6 7
| import xlwt wookbook = xlwt.Workbook() wooksheet = wookbook.add_sheet("sheet1") for i in range(0, 9): for j in range(0, i + 1): wooksheet.write(i, j, "%d*%d=%d" % (i + 1, j + 1, (j + 1) * (i + 1))) wookbook.save("1.xls")
|
完整代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
|
import bs4 import re import urllib.request, urllib.error import xlwt import sqlite3
baseurl = "https://movie.douban.com/top250?start="  # page offset appended later
datalist = []  # filled by GetData, consumed by SaveData


def main():
    """Crawl the Top 250 and write the spreadsheet."""
    GetData(baseurl)
    SaveData(datalist)


# One compiled pattern per field scraped out of a <div class="item"> chunk.
findlink = re.compile(r'<a href="(.*?)">')
findImgsrc = re.compile(r'<img.*src="(.*?)"', re.S)
findMovieName = re.compile(r'<span class="title">(.*)</span>')
findScore = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
findIformation = re.compile(r'<span class="inq">(.*)</span>')
def GetData(baseurl):
    """Crawl all ten Top-250 pages and append one row per movie to ``datalist``.

    Each row is [link, img_src, chinese_title, other_title, score, quote].
    Returns the shared module-level ``datalist``.
    """
    for i in range(0, 10):
        # Bug fix: Douban pages step by 25 (start=0, 25, ... 225). The
        # original used ``str(i)`` and fetched ten near-identical pages.
        url = baseurl + str(i * 25)
        html = AskUrl(url)
        soup = bs4.BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            link = re.findall(findlink, item)[0]
            data.append(link)
            img = re.findall(findImgsrc, item)[0]
            data.append(img)
            name = re.findall(findMovieName, item)
            if len(name) == 2:
                data.append(name[0])                           # Chinese title
                data.append(name[1].replace("\xa0/\xa0", ""))  # original title
            else:
                data.append(name[0])
                data.append(" ")  # placeholder when there is no foreign title
            score = re.findall(findScore, item)[0]
            data.append(score)
            # Bug fix: some movies have no one-line quote; the original
            # indexed [0] on an empty findall result and raised IndexError.
            information = re.findall(findIformation, item)
            data.append(information[0].replace("。", "") if information else " ")
            datalist.append(data)
    return datalist
def AskUrl(url): head = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66" } request = urllib.request.Request(url, headers=head) html = "" try: response = urllib.request.urlopen(request) html = response.read().decode("utf-8") except urllib.error.URLError as e: if hasattr(e, "code"): print(e.code) if hasattr(e, "reason"): print(e.reason) return html
def SaveData(datalist): wookbook = xlwt.Workbook(encoding="utf-8") wooksheet = wookbook.add_sheet("sheet1") for i in range(0, 250): print("第%d条" % (i + 1)) data = datalist[i] for j in range(0, 6): wooksheet.write(i, j, data[j]) wookbook.save("douban.xls")
if __name__ == "__main__": main()
|
SQlite部分
结构如下
1 2 3 4 5 6 7 8 9
| **import sqlite3 conn = sqlite3.connect("test.db") print("Successfully open") c = conn.cursor() sql = "" c.execute(sql) conn.commit() conn.close() print("Successfully create")**
|