Practice | Spider: Douban Top 250

Spider

Preparation

Installing the libraries

Open a terminal in the Python Scripts directory and install each library with '.\pip install ***'.

Commonly used libraries: bs4, re, urllib, xlwt, sqlite3 (re, urllib and sqlite3 ship with Python; only bs4 and xlwt need installing).

Importing the libraries

# -*- coding: utf-8 -*-

import bs4                            # parse web pages and extract data
import re                             # regular expressions
import urllib.request, urllib.error   # build requests and fetch page data
import xlwt                           # Excel operations
import sqlite3                        # SQLite database operations

def aom():
    # placeholder function to check that the script runs
    print("*" * 10)

if __name__ == "__main__":
    aom()

Building the workflow

baseurl = "https://movie.douban.com/top250?start=0&filter="

豆瓣电影 Top 250

savepath = "E:/py/douban.xls"

  1. Crawl the pages

    datalist = GetData(baseurl)

    def GetData(baseurl):
        datalist = []
        # parse each page one by one
        return datalist
  2. Parse the data

  3. Save the data (a minimal skeleton tying the three steps together is sketched after this list)


    def SaveData(savepath):
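
Tying the three steps together, a minimal skeleton looks roughly like this (a sketch only; the exact parameter lists are settled in the complete code at the end of the post):

# Sketch of the overall flow; GetData, AskUrl and SaveData are filled in below.
def main():
    baseurl = "https://movie.douban.com/top250?start="
    datalist = GetData(baseurl)   # step 1: crawl, step 2: parse
    SaveData(datalist)            # step 3: save to douban.xls

if __name__ == "__main__":
    main()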

Fetching the data

def AskUrl(url):
    # the header disguises the request as a normal browser visit
    head = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
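
A quick way to sanity-check AskUrl on its own (assuming the function above is in the same file) is to fetch the first page and look at the length of the returned HTML:

if __name__ == "__main__":
    page = AskUrl("https://movie.douban.com/top250?start=0")
    # a non-empty result means the request and the UTF-8 decode both succeeded
    print(len(page))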

Parsing the data

A note on bs4

bs4 converts an HTML document into a tree structure in which every node is a Python object. All objects fall into four kinds:

  1. Tag

    from bs4 import BeautifulSoup

    f = open("E:\\baidu.html", "rb")
    html = f.read()
    bs = BeautifulSoup(html, "html.parser")

    # each attribute access prints the first tag with that name
    print(bs.title)
    print(bs.a)
    print(bs.link)
    print(type(bs.link))

    This returns the first tag with the given name.

  2. NavigableString

    from bs4 import BeautifulSoup

    f = open("E:\\baidu.html", "rb")
    html = f.read()
    bs = BeautifulSoup(html, "html.parser")

    print(bs.title.string)
    print(type(bs.title.string))

    The above prints the string wrapped by the tag (if that string happens to be an HTML comment, see the Comment type below).

    The following prints the tag's attributes as a dictionary:

    from bs4 import BeautifulSoup

    f = open("E:\\baidu.html", "rb")
    html = f.read()
    bs = BeautifulSoup(html, "html.parser")

    print(bs.a.attrs)
    print(type(bs.a.attrs))
  3. BeautifulSoup

    from bs4 import BeautifulSoup

    f = open("E:\\baidu.html", "rb")
    html = f.read()
    bs = BeautifulSoup(html, "html.parser")

    print(bs)
  4. Comment

    When the string printed in the NavigableString example above is an HTML comment, the comment markers are stripped and the object's type is Comment rather than NavigableString.
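
    The post gives no standalone example for Comment; a minimal sketch with an inline HTML string (instead of baidu.html):

    from bs4 import BeautifulSoup

    html = '<a><!-- this is an HTML comment --></a>'
    bs = BeautifulSoup(html, "html.parser")

    # .string drops the <!-- --> markers; the type is Comment, not NavigableString
    print(bs.a.string)
    print(type(bs.a.string))   # <class 'bs4.element.Comment'>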

Accessing nodes

Traversing the document

from bs4 import BeautifulSoup

f = open("E:\\baidu.html", "rb")
html = f.read()
bs = BeautifulSoup(html, "html.parser")

print(bs.head.contents[0])

contents is a list, so its elements can be accessed by index.
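
Besides contents, a couple of other traversal attributes are handy; a short sketch (same E:\baidu.html assumption as above):

from bs4 import BeautifulSoup

f = open("E:\\baidu.html", "rb")
html = f.read()
bs = BeautifulSoup(html, "html.parser")

# .children is an iterator over the same nodes as .contents
for child in bs.head.children:
    print(child.name)

# .parent walks back up the tree
print(bs.title.parent.name)   # normally "head"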

Searching the document

  1. find_all: string filtering; it finds tags whose name matches the string exactly

    from bs4 import BeautifulSoup

    f = open("E:\\baidu.html", "rb")
    html = f.read().decode("utf-8")
    bs = BeautifulSoup(html, "html.parser")

    t_list = bs.find_all("a")
    print(t_list)
    1. Regular-expression search: find_all uses the pattern's search() method, so every tag whose name contains "a" is returned

      from bs4 import BeautifulSoup
      import re

      f = open("E:\\baidu.html", "rb")
      html = f.read().decode("utf-8")
      bs = BeautifulSoup(html, "html.parser")

      t_list = bs.find_all(re.compile("a"))
      print(t_list)
    2. Function: pass in a function, and tags are kept or dropped according to what it returns

      from bs4 import BeautifulSoup
      import re

      f = open("E:\\baidu.html", "rb")
      html = f.read().decode("utf-8")
      bs = BeautifulSoup(html, "html.parser")

      def is_name_exist(tag):
          # keep only tags that carry a "name" attribute
          return tag.has_attr("name")

      t_list = bs.find_all(is_name_exist)

      print(t_list)
  2. kwargs: keyword-argument filtering

    from bs4 import BeautifulSoup
    import re

    f = open("E:\\baidu.html", "rb")
    html = f.read().decode("utf-8")
    bs = BeautifulSoup(html, "html.parser")

    t_list = bs.find_all(id="head")
    print(t_list)
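
    Keyword arguments can filter on other attributes too. A small sketch with an inline HTML string, so it does not depend on what baidu.html contains:

    from bs4 import BeautifulSoup
    import re

    html = '<a href="https://www.baidu.com">baidu</a><a href="https://movie.douban.com">douban</a>'
    bs = BeautifulSoup(html, "html.parser")

    # any attribute name works as a keyword; values may be strings or compiled patterns
    print(bs.find_all(href=re.compile("douban")))
    # class_ (with the underscore) filters on the class attribute; empty list here
    print(bs.find_all(class_="item"))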

A note on regular expressions

Reference: 史上最全常用正则表达式大全

Common operators

Using the re library

Modifiers (flags) of the re library
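
The modifier table from the original post is not reproduced here; as a small illustration, two commonly used flags of the standard re module behave like this:

import re

text = "First line\nsecond LINE"

# re.S (DOTALL) lets "." match newlines as well
print(re.findall(r"First.*LINE", text, re.S))   # ['First line\nsecond LINE']

# re.I (IGNORECASE) ignores case
print(re.findall(r"line", text, re.I))          # ['line', 'LINE']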

import re
pat = re.compile("AA")      # "AA" is the regular expression
m = pat.search("ABCFFFAA")  # the string being matched
print(m)                    # the reported span is half-open: [start, end)

The search() above only finds the first matching substring. Below is a second way to write it, where the first argument is the regular expression:

import re
print(re.search("aa", "fsaa"))
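
The match object returned by search() carries more detail than the default print; a short sketch of the methods behind the half-open span mentioned above:

import re

m = re.search("aa", "fsaa")
if m is not None:
    print(m.group())   # the matched text: 'aa'
    print(m.span())    # (start, end) with end exclusive: (2, 4)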

Next is findall, which returns every match:

import re
print(re.findall("aa", "faasaa"))

import re
print(re.findall("[A-Z]", "faCasBaaAA"))

Using sub: in the third string, every occurrence of the first argument is replaced with the second:

import re
print(re.sub("a", "A", "asdfagfsg"))

It is recommended to prefix regular expressions with r (a raw string) so escape characters are not a concern:

import re
print(re.sub("a", r"/A/", "asdfagfsg"))
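
To apply the raw-string advice to the pattern itself (the example above only puts r on the replacement), a tiny sketch with a pattern that contains a backslash:

import re

# r"\d+" keeps the backslash literal; raw strings avoid surprises with sequences like "\b"
print(re.findall(r"\d+", "abc123def45"))   # ['123', '45']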

Extracting with regular expressions

findlink = re.compile(r'<a href="(.*?)">')  # compiled pattern objects describing the extraction rules
findImgsrc = re.compile(r'<img.*src="(.*?)"', re.S)
findMovieName = re.compile(r'<span class="title">(.*)</span>')
findScore = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
findIformation = re.compile(r'<span class="inq">(.*)</span>')
findDitail = re.compile(r'<p class="">(.*)</p>', re.S)
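
To see what the capture groups return, the patterns can be run against a hand-written snippet (the snippet and movie name below are made up, not real Douban markup; re and the pattern objects above are assumed to be defined):

sample = '''<span class="title">Example Movie</span>
<span class="rating_num" property="v:average">9.7</span>'''

print(re.findall(findMovieName, sample))   # ['Example Movie']
print(re.findall(findScore, sample))       # ['9.7']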
def GetData(baseurl):
    datalist = []
    for i in range(0, 1):
        url = baseurl + str(i * 25)  # each page lists 25 movies, so the offset advances by 25
        html = AskUrl(url)           # fetch the page
        # parse it
        soup = bs4.BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            #print(item)
            #break
            data = []
            item = str(item)
            link = re.findall(findlink, item)[0]  # apply the pattern; [0] keeps the first match
            #print(link)
            img = re.findall(findImgsrc, item)[0]
            #print(img)
            name = re.findall(findMovieName, item)[0]
            #print(name)
            score = re.findall(findScore, item)[0]
            print(score)
            break
    return datalist

Parsing the tags

def GetData(baseurl):
    datalist = []
    for i in range(0, 1):
        url = baseurl + str(i * 25)  # each page lists 25 movies, so the offset advances by 25
        html = AskUrl(url)           # fetch the page
        # parse it
        soup = bs4.BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            #print(item)
            #break
            data = []
            item = str(item)
            link = re.findall(findlink, item)[0]  # apply the pattern; [0] keeps the first match
            data.append(link)
            #print(link)
            img = re.findall(findImgsrc, item)[0]
            data.append(img)
            #print(img)
            name = re.findall(findMovieName, item)
            if len(name) == 2:
                ctitle = name[0]                           # Chinese title
                data.append(ctitle)
                otitle = name[1].replace("\xa0/\xa0", "")  # original title
                data.append(otitle)
            else:
                data.append(name[0])
                data.append(" ")
            #print(name)
            score = re.findall(findScore, item)[0]
            data.append(score)
            #print(score)
            information = re.findall(findIformation, item)
            if len(information) != 0:
                data.append(information[0].replace("。", ""))
            else:
                data.append(" ")                           # some entries have no one-line quote

            datalist.append(data)
    print(datalist)
    return datalist

Saving

Saving to Excel

import xlwt
wookbook = xlwt.Workbook()
wooksheet = wookbook.add_sheet("sheet1")
wooksheet.write(0, 0, "Hello")
wookbook.save("text.xls")

Below is a 9x9 multiplication table that demonstrates how to use xlwt in Python:

import xlwt
wookbook = xlwt.Workbook()
wooksheet = wookbook.add_sheet("sheet1")
for i in range(0, 9):
    for j in range(0, i + 1):
        wooksheet.write(i, j, "%d*%d=%d" % (i + 1, j + 1, (j + 1) * (i + 1)))
wookbook.save("1.xls")

Complete code

# -*- coding: utf-8 -*-

import bs4                            # parse web pages and extract data
import re                             # regular expressions
import urllib.request, urllib.error   # build requests and fetch page data
import xlwt                           # Excel operations
import sqlite3                        # SQLite database operations

baseurl = "https://movie.douban.com/top250?start="

datalist = []

def main():
    GetData(baseurl)
    SaveData(datalist)

findlink = re.compile(r'<a href="(.*?)">')  # compiled pattern objects describing the extraction rules
findImgsrc = re.compile(r'<img.*src="(.*?)"', re.S)
findMovieName = re.compile(r'<span class="title">(.*)</span>')
findScore = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
findIformation = re.compile(r'<span class="inq">(.*)</span>')

def GetData(baseurl):
    for i in range(0, 10):
        url = baseurl + str(i * 25)  # each page lists 25 movies, so the offset advances by 25
        html = AskUrl(url)           # fetch the page
        # parse it
        soup = bs4.BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            link = re.findall(findlink, item)[0]  # apply the pattern; [0] keeps the first match
            data.append(link)
            img = re.findall(findImgsrc, item)[0]
            data.append(img)
            name = re.findall(findMovieName, item)
            if len(name) == 2:
                ctitle = name[0]                           # Chinese title
                data.append(ctitle)
                otitle = name[1].replace("\xa0/\xa0", "")  # original title
                data.append(otitle)
            else:
                data.append(name[0])
                data.append(" ")
            score = re.findall(findScore, item)[0]
            data.append(score)
            information = re.findall(findIformation, item)
            if len(information) != 0:
                data.append(information[0].replace("。", ""))
            else:
                data.append(" ")                           # some entries have no one-line quote

            datalist.append(data)
    #print(datalist)
    return datalist

def AskUrl(url):
    # the header disguises the request as a normal browser visit
    head = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        #print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

def SaveData(datalist):
    wookbook = xlwt.Workbook(encoding="utf-8")   # create the workbook
    wooksheet = wookbook.add_sheet("sheet1")
    for i in range(0, 250):
        print("row %d" % (i + 1))
        data = datalist[i]
        for j in range(0, 6):
            wooksheet.write(i, j, data[j])
    wookbook.save("douban.xls")

if __name__ == "__main__":
    main()

The SQLite part

The basic structure is as follows:

import sqlite3
conn = sqlite3.connect("test.db")
print("Successfully open")
c = conn.cursor()
sql = ""          # the SQL statement goes here
c.execute(sql)
conn.commit()
conn.close()
print("Successfully create")

Unless otherwise stated, all posts on this blog are licensed under CC BY-SA 4.0; please credit the source when reposting.