Suggested participants:
Trainees without much programming experience.
Course overview:
Polish the rank-checking code and add features to monitor your own and your competitors' rankings.
After class:
Review the material from these sessions. You do not have to write everything from scratch; it is fine to copy snippets of code and piece the final functionality together, but you should get to the point where you can write the code fluently.
First half
A handy use of sys.argv[1] on Windows: drag and drop a file onto the script. (It needs to be combined with raw_input().)
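A minimal sketch of the drag-and-drop trick (Python 2; the script name drag_drop_demo.py and the line-counting logic are placeholders, not part of the course code): on Windows, dropping a file onto a .py script launches it with the dropped file's path as sys.argv[1], and a trailing raw_input() keeps the console window open so the output can be read.

# drag_drop_demo.py -- sketch of the Windows drag-and-drop trick
import sys

input_file = sys.argv[1]  # path of the file that was dragged onto the script

# Placeholder processing: count the non-empty lines in the dropped file.
count = 0
for line in open(input_file):
    if line.strip():
        count += 1
print 'non-empty lines:', count

# Without this the console window closes as soon as the script finishes;
# raw_input() keeps it open until Enter is pressed.
raw_input('Press Enter to exit...')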
Code 1:
import re, urllib2, urllib, sys

def get_urls(kw):
    # Fetch the first 100 Baidu results for the keyword (rn=100 asks for 100 results per page).
    html = urllib2.urlopen('http://www.baidu.com/s?wd=%s&rn=100' % urllib.quote_plus(kw)).read()
    # NOTE: the HTML tags inside the original regex and replace() calls were lost when the
    # page was published; the pattern below (the green display URL in Baidu's result block,
    # with <b>...</b> highlighting stripped) is a reconstruction of the likely intent.
    urls = re.findall('<span class="g">\s*(.+?) ', html)
    urls = [ 'http://' + url.replace('<b>', '').replace('</b>', '') for url in urls ]
    return urls

input_file = sys.argv[1]          # keyword list, one keyword per line
self_domain = sys.argv[2]         # your own domain
competitor_domain = sys.argv[3]   # the competitor's domain

self = 0
competitor = 0
for line in open(input_file):
    kw = line.rstrip()
    urls = get_urls(kw)

    self_find = False
    competitor_find = False
    for pos, url in enumerate(urls, 1):
        if not self_find and self_domain in url:
            self += 100 - pos           # higher positions earn more points
            self_find = True
        if not competitor_find and competitor_domain in url:
            competitor += 100 - pos
            competitor_find = True
print self, competitor
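A hedged usage note (the script name rank_compare.py, the file keywords.txt, and the two domains are placeholders, not from the original): because the script takes three arguments, it is run from the command line rather than by drag and drop.

python rank_compare.py keywords.txt example.com competitor.com

It prints two totals, your score and the competitor's, where each keyword found in the top 100 contributes 100 minus its position.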
Second half
import re, urllib2, urllib, sys, time

date = time.strftime('%Y-%m-%d')

# Estimated click-through weight for each of the top 10 positions.
rank_score = [
    28.56,
    19.23,
    10.20,
    8.14,
    7.50,
    5.72,
    4.01,
    4.41,
    5.53,
    6.70,
]

def get_urls(kw):
    while 1:
        html = urllib2.urlopen('http://www.baidu.com/s?wd=%s&rn=100' % urllib.quote_plus(kw)).read()
        # If Baidu returns its captcha page, wait ten minutes and retry.
        if '="http://verify.baidu.com/' in html:
            print 'captcha'
            time.sleep(600)
            continue
        else:
            break
    # NOTE: as in Code 1, the HTML tags inside the original regex were lost in publishing;
    # this pattern is a reconstruction of the likely intent.
    urls = re.findall('<span class="g">\s*(.+?) ', html)
    urls = [ 'http://' + url for url in urls ]
    return urls

input_file = sys.argv[1]
self_domain = sys.argv[2]
competitor_domain = sys.argv[3]

f = open('rank_report.csv', 'a')

self = 0
competitor = 0
for line in open(input_file):
    kw = line.rstrip()
    urls = get_urls(kw)

    self_find = False
    competitor_find = False
    for pos, url in enumerate(urls, 1):
        if not self_find and self_domain in url:
            if pos <= 10:
                self += rank_score[pos-1]
            self_find = True
        if not competitor_find and competitor_domain in url:
            if pos <= 10:
                competitor += rank_score[pos-1]
            competitor_find = True
# note: %d truncates the fractional scores to whole numbers
f.write('%s\t%d\t%d\r\n' % (date, self, competitor))
f.close()
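A hedged usage note (the script name monitor_rank.py and the file names are placeholders): run it once a day, for example from the Windows Task Scheduler or cron, and each run appends one tab-separated line to rank_report.csv that can later be charted in a spreadsheet.

python monitor_rank.py keywords.txt example.com competitor.com
# each run appends a line of the form:
# YYYY-MM-DD<TAB>self_score<TAB>competitor_score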
Reference answers to the last session's exercises:
import sys, urllib, urllib2, re

kw = sys.argv[1]    # keyword
host = sys.argv[2]  # domain to look for

rank = -1
for pn in range(1, 10+1):
    # so.com paginates with pn=1..10, ten results per page.
    html = urllib2.urlopen('http://www.so.com/s?q=%s&pn=%d' % (urllib.quote_plus(kw), pn)).read()
    # NOTE: the tail of the original regex was lost in publishing; capturing the <cite>
    # element inside each result block is a reconstruction of the likely intent.
    urls = re.findall('class="res-list">[\s\S]*?<cite>(.+?)</cite>', html)
    for pos, url in enumerate(urls, 1):
        if host in url:
            rank = pos + (pn-1)*10
            break
    if rank != -1:
        break
print rank
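A hedged usage note for the first answer (the script name so_rank.py and the arguments are placeholders):

python so_rank.py "some keyword" example.com
# prints the keyword's rank on so.com within the top 100 results, or -1 if the host is not found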
# coding:utf-8
import sys, urllib, urllib2, time

def is_indexed(url):
    # Search Baidu for the exact URL; if Baidu says it cannot find it, it is not indexed.
    while 1:
        html = urllib2.urlopen('http://www.baidu.com/s?wd=%s' % urllib.quote_plus(url)).read()
        # Captcha page: wait ten minutes and retry.
        if '="http://verify.baidu.com' in html:
            print 'captcha'
            time.sleep(600)
            continue
        # Both strings are Baidu's "not found" messages ("this URL was not found" /
        # "sorry, nothing matching ... was found"). NOTE: part of the second marker's
        # HTML was lost in publishing; the tag between its two pieces is a reconstruction.
        if 'class=f14>没有找到该URL。您可以直接访问' in html or 'class="nors"><p>抱歉,没有找到与' in html:
            return 0
        else:
            return 1

total = 0
count = 0
for line in open(sys.argv[1]):
    url = line.rstrip()
    indexed = is_indexed(url)
    print url, indexed

    if indexed == 1:
        count += 1
    total += 1

# 收录率 = indexed rate (percentage of the listed URLs that Baidu has indexed).
print '收录率:%d%%' % (count * 100 / total)
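A hedged usage note for the second answer (the script name check_index.py and the file urls.txt are placeholders):

python check_index.py urls.txt
# prints each URL followed by 1 (indexed on Baidu) or 0 (not indexed),
# then the overall indexed rate as a percentage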