TestUrl 里有几个错误:第一,urllib.request.urlopen() 返回的响应对象用 read() 读出来的数据不是字符串(str),而是字节串(bytes),需要先用 decode() 把它转成字符串,然后才能用 'xxx' in 'yyy' 做包含判断;第二,print 里对字符串做 % 格式化的写法也有误。请看下面修改后的代码。
# _*_ coding: utf-8 _*_
'''
题目:
有配置文件有类似下面的配置:
http://m.sohu.com/c/5/|财经|200
http://m.sohu.com/|体育|200
http://m.sohu.com/n/346620805/|信号|200
编写一个脚本,要求如下:
1. 读取配置文件,对每一个url进行检查.
例如http://m.sohu.com/c/5/, 这个url返回的结果应该包含"财经"两个字,并且HTTP状态码是200, 否则报错。
2. 需支持并行的检查多个链接
'''
import re
import urllib.request as urllib2
import threading
from os.path import getsize
import os,time
# Offset in the config file up to which blocks have already been handed out.
curPosition = 0
# Re-entrant lock taken while curPosition is read and advanced.
rlock = threading.RLock()
'''设计检查url的函数(完成)'''
def TestUrl(line):
    """Check one ``url|tag|status`` config line and print the verdict.

    The URL is fetched; the check passes when the HTTP status equals the
    expected status and the response body contains ``tag``.

    Args:
        line: One config line, e.g. ``http://m.sohu.com/|体育|200``.
            Surrounding whitespace and the line terminator are ignored.
            The third field is the expected HTTP status; it defaults to
            200 when missing or empty.

    Returns:
        True if the check passed, False if it failed (malformed line,
        network/HTTP error, wrong status or missing tag), and None for a
        blank line, which is skipped without printing anything.
    """
    # Function-scope imports so the file's import block needs no change.
    from http.client import HTTPException
    from urllib.error import HTTPError

    line = line.strip()
    if not line:
        return None

    fields = [part.strip() for part in line.split('|')]
    url = fields[0]
    if len(fields) < 2 or not url:
        print('The url %s is wrong.' % url)
        return False
    tag = fields[1]
    try:
        expected_status = int(fields[2]) if len(fields) > 2 and fields[2] else 200
    except ValueError:
        # The status field is not a number, so the line cannot be checked.
        print('The url %s is wrong.' % url)
        return False

    try:
        try:
            # The timeout keeps one stalled server from blocking a worker forever.
            response = urllib2.urlopen(url, timeout=10)
        except HTTPError as err:
            # urlopen raises on non-2xx answers; the error object still
            # carries the status code and body, so treat it as the response.
            response = err
        try:
            statuscode = response.getcode()
            contents = response.read()
        finally:
            response.close()
    except (OSError, ValueError, HTTPException):
        # Unreachable host, timeout, unsupported URL scheme, broken reply.
        passed = False
    else:
        # read() returns bytes; undecodable bytes are replaced rather than
        # raised so a non-UTF-8 page fails the check instead of crashing.
        text = contents.decode('utf-8', errors='replace')
        passed = statuscode == expected_status and tag in text

    if passed:
        print('The url %s is true' % url)
    else:
        print('The url %s is wrong.' % url)
    return passed
''' 设计支持并行连接的程序'''
class Resource(object):
    """The config file shared by all readers, plus its chunking parameters.

    Attributes:
        fileName: Path of the config file.
        blockSize: Number of bytes handed out to a reader at a time.
        fileSize: Size of the file in bytes, set by getFileSize().
    """

    def __init__(self, fileName, blockSize=1000):
        """Remember the file and block size, then measure the file.

        Args:
            fileName: Path of the config file.
            blockSize: Bytes per block; must be positive.

        Raises:
            ValueError: If blockSize is not positive.
            OSError: If the file cannot be inspected.
        """
        if blockSize <= 0:
            # A non-positive block size would never advance through the file.
            raise ValueError('blockSize must be positive')
        self.fileName = fileName
        self.blockSize = blockSize
        self.getFileSize()

    def getFileSize(self):
        """Store the size of the file, in bytes, in ``self.fileSize``."""
        # os.path.getsize reports the byte size directly, with no file
        # handle to leak and no reliance on text-mode tell() values.
        self.fileSize = os.path.getsize(self.fileName)
class Reader(threading.Thread):
    """Worker thread that claims blocks of the config file and checks them.

    Each pass claims the next ``blockSize`` bytes by advancing the shared
    ``curPosition`` under ``rlock``, then calls TestUrl() on every line
    that *starts* inside the claimed byte range.
    """

    # Encoding used to decode config lines.
    # NOTE(review): UTF-8 is assumed from the file's coding line - confirm.
    encoding = 'utf-8'

    def __init__(self, res):
        """Create a reader for ``res``, a Resource describing the file."""
        super(Reader, self).__init__()
        self.res = res

    def run(self):
        """Claim and process blocks until the whole file has been handed out."""
        global curPosition
        fileSize = self.res.fileSize
        blockSize = self.res.blockSize
        # Binary mode: block offsets are byte offsets, and seeking to an
        # arbitrary byte is only well-defined on a binary stream.
        with open(self.res.fileName, 'rb') as fstream:
            while True:
                # Claim the next block; 'with' releases the lock on any exit.
                with rlock:
                    startPosition = curPosition
                    endPosition = min(startPosition + blockSize, fileSize)
                    curPosition = endPosition
                if startPosition >= fileSize:
                    break

                if startPosition == 0:
                    fstream.seek(0)
                else:
                    # Step back one byte before skipping to the next line
                    # start. If the block begins exactly on a line start,
                    # only the preceding newline is consumed and that line
                    # is kept; otherwise the tail of the line owned by the
                    # previous block is skipped.
                    fstream.seek(startPosition - 1)
                    fstream.readline()
                pos = fstream.tell()

                while pos < endPosition:
                    raw = fstream.readline()
                    if not raw:
                        # File is shorter than when it was measured.
                        break
                    pos = fstream.tell()
                    line = raw.decode(self.encoding, errors='replace').strip()
                    if not line:
                        continue  # blank lines carry no URL to check
                    try:
                        TestUrl(line)
                    except Exception as exc:
                        # Thread boundary: report the failure and carry on
                        # so one bad entry does not abort the whole block.
                        print('The check of %r failed: %s' % (line, exc))
if __name__ == '__main__':
    # Script entry point: check every line of the config file using a
    # fixed number of Reader threads, then report the elapsed time.
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    starttime = time.perf_counter()
    # Number of worker threads.
    threadNum = 4
    # Config file with one "url|tag|status" entry per line.
    fileName = 'a.txt'
    res = Resource(fileName)
    # Create all workers first, then start them, then wait for all of them.
    threads = [Reader(res) for _ in range(threadNum)]
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
    print('Elapsed: %.2f s' % (time.perf_counter() - starttime))