python zhihu-spider

Author Avatar
Xzhah 11月 24, 2017
  • 在其它设备中阅读本文章

这个爬虫并不是我写的,是找一个师傅要的。自己看懂了过后把代码贴上来加上自己的一些领悟。

* 成果

* 完整代码

# -*- coding:utf-8 -*-

# Python 2 scraper: downloads answer images from a Zhihu question page.
from selenium import webdriver  # drives a real Chrome browser (the page is JS-rendered)
import time

import urllib
import urllib2  # NOTE(review): imported but never used below
from bs4 import BeautifulSoup

from HTMLParser import HTMLParser  # Python 2 module (html.parser in Python 3)

import sys
# HACK: force the process-wide default encoding to UTF-8 so writing the
# (Chinese) page text to files does not raise UnicodeEncodeError.
# reload() is needed because site.py deletes setdefaultencoding at startup.
reload(sys)
sys.setdefaultencoding( "utf-8" )


def main():
# ****************** selenium Operations ***********************

driver = webdriver.Chrome() # open with Chrome

driver.get("https://www.zhihu.com/question/37787176") # 当一个颜值很高的程序员是怎样一番体验?


# ****************** Scroll to the bottom, and do it 10 times *********
def execute_times(times):

for i in range(times + 1):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
try:
driver.find_element_by_css_selector('button.QuestionMainAction').click()
print "page" + str(i)
time.sleep(1)
except:
break

execute_times(15)


result_raw = driver.page_source
result_soup = BeautifulSoup(result_raw, 'html.parser')

result_bf = result_soup.prettify()

# **************** Store raw data file *****************************************
with open("./output/rawfile/raw_result.txt", 'w') as girls:
girls.write(result_bf)
girls.close()
print "Store raw data successfully!!!"

# **************** Find all nodes that we want *****************************************
with open("./output/rawfile/noscript_meta.txt", 'w') as noscript_meta:
noscript_nodes = result_soup.find_all('noscript')
noscript_inner_all = ""
for noscript in noscript_nodes:
noscript_inner = noscript.get_text()
noscript_inner_all += noscript_inner + "\n"

h = HTMLParser()
noscript_all = h.unescape(noscript_inner_all)
noscript_meta.write(noscript_all)

noscript_meta.close()
print "Store noscript meta data successfully!!!"

# **************** Store meta data of imgs *****************************************
img_soup = BeautifulSoup(noscript_all, 'html.parser')
img_nodes = img_soup.find_all('img')
with open("./output/rawfile/img_meta.txt", 'w') as img_meta:
count = 0
for img in img_nodes:
if img.get('src') is not None:
img_url = img.get('src')

line = str(count) + "\t" + img_url + "\n"
img_meta.write(line)
urllib.urlretrieve(img_url, "./output/image/" + str(count) + ".jpg")
count += 1

img_meta.close()
print "Store meta data and images successfully!!!"

if __name__ == '__main__':
main()

* 翻页

这部分是不断跳至页面最底部点击查看更多回答

* 读入网页源码并写入raw_result.txt

* 筛选noscript段(img的部分)

* 下载图片

* 感悟

日渐消瘦