# -*- coding: utf-8 -*-
# Python发送WEB请求,并对WEB内容进行解析
# 关键词:python 发送http请求、python 发送post请求、python 发送https请求
import os,sys
import urllib2
import gzip
import StringIO
from datetime import datetime
#get relate keywords by product subject
def getRelateKeywords(requestUrl):
    """Fetch *requestUrl* and return the related-keywords payload of the reply.

    Spaces in the URL are replaced with ``%20``, the request advertises gzip
    support, and a gzip-encoded body is decompressed.  A byte-string body is
    decoded as GBK.  The returned text starts at the CDATA marker that opens
    the ``en_skw`` object and stops just before the next ``</field>`` tag.

    Args:
        requestUrl: full URL of the query to send.

    Returns:
        The unicode slice described above; an empty unicode string when the
        opening marker is absent; everything from the opening marker to the
        end of the page when no ``</field>`` follows it.

    Raises:
        urllib2.URLError: propagated from ``urllib2.urlopen``.
    """
    url = requestUrl.replace(' ', '%20')
    page_encode = "gbk"
    request = urllib2.Request(url)
    request.add_header("Accept-encoding", "gzip")
    usock = urllib2.urlopen(request)
    try:
        page = usock.read()
        isGzipped = usock.headers.get('content-encoding', None) == 'gzip'
    finally:
        # Release the connection even when reading the body fails.
        usock.close()
    if isGzipped:
        page = gzip.GzipFile(fileobj=StringIO.StringIO(page)).read()
    if not isinstance(page, unicode):
        page = unicode(page, page_encode)

    # Locate the payload explicitly: str.find() returns -1 on a miss, and
    # slicing with -1 would silently keep or drop the wrong characters.
    startPos = page.find('<![CDATA[{"en_skw":{')
    if startPos == -1:
        return u''
    page = page[startPos:]
    endPos = page.find("</field>")
    if endPos == -1:
        return page
    return page[:endPos]
#end of getRelateKeywords
#get the result and write it to the file
def writeResultToFile(productSubjectRelKw, targetFilePath):
    """Append one result record to the output file as its own line.

    Args:
        productSubjectRelKw: the record text, without a line terminator.
        targetFilePath: path of the file to append to; it is opened in
            append mode, so it is created when it does not exist.
    """
    # The context manager closes the handle even if the write raises.
    with open(targetFilePath, 'a') as fileHandler:
        fileHandler.write(productSubjectRelKw + '\n')
#end of writeResultToFile
#split the skw result
def getSplitSkwResult(relateKeywords, keywordsType):
    """Cut the raw keyword run of one type out of the payload text.

    The result is the text between the opening marker of the requested list
    (``"hot":["`` or ``"blue":["``) and the end of that list.  Individual
    keywords are still joined by ``","``; splitting is left to the caller.

    Args:
        relateKeywords: payload text as returned by the query.
        keywordsType: ``'hot'`` or ``'blue'``.

    Returns:
        The joined keyword text, or ``''`` when the type is unknown or its
        marker does not occur in the payload.
    """
    hotMarker = '"hot":["'
    blueMarker = '"blue":["'

    if keywordsType == 'hot':
        markerPos = relateKeywords.find(hotMarker)
        # Compare against -1, not 0: a marker at index 0 is a valid match.
        if markerPos != -1:
            remainder = relateKeywords[markerPos + len(hotMarker):]
            return remainder[:remainder.find('"]')]
    elif keywordsType == 'blue':
        markerPos = relateKeywords.find(blueMarker)
        if markerPos != -1:
            remainder = relateKeywords[markerPos + len(blueMarker):]
            # NOTE(review): offset kept exactly as in the original.  When the
            # terminator (which contains a space) is not found, find() gives
            # -1 and the slice drops the last 7 characters, which equals
            # stripping a trailing '"]}}]]>' -- presumably the case this was
            # tuned for; confirm against a real response before changing it.
            return remainder[:remainder.find('"]}} ]]>') - 6]
    return ''
#end of getSplitSkwResult
#get result
def getUrlResult(srcFilePath, targetFilePath):
    """Query related keywords for every input line and append them to a file.

    Each input line is split at its first comma into a product id and a
    product subject (a line without a comma is treated as a subject with an
    empty id).  The subject is sent to the smart-query service, and every
    'hot' and then every 'blue' keyword found is appended to the target file
    as ``id||subject||keyword||rank||type``, with rank counting from 1 within
    each type.  A progress count is printed every 100 lines.

    Args:
        srcFilePath: path of the input file, one ``id,subject`` per line.
        targetFilePath: path of the output file (appended to).

    Returns:
        The number of input lines processed.
    """
    # Read everything up front so the input handle is closed before the
    # slow network loop starts.
    with open(srcFilePath, 'r') as fileHandler:
        textlist = fileHandler.readlines()

    lineCount = 0
    for line in textlist:
        # Strip only the line terminator; slicing off the last character
        # would eat real data when the final line has no trailing newline.
        line = line.rstrip('\r\n')
        commaPos = line.find(",")
        if commaPos == -1:
            productId = ''
            productSubject = line
        else:
            productId = line[:commaPos]
            productSubject = line[commaPos + 1:]

        requestUrl = ('http://10.20.137.17:30008/bin/smartquery?query='
                      + productSubject + '&resconfig=skw')
        relateKeywords = getRelateKeywords(requestUrl)

        lineCount = lineCount + 1
        if lineCount % 100 == 0:
            print(lineCount)

        # Hot keywords are written before blue ones, each ranked from 1.
        for keywordsType in ('hot', 'blue'):
            keywordsList = getSplitSkwResult(relateKeywords, keywordsType)
            if not keywordsList.strip():
                continue
            for rank, keyword in enumerate(keywordsList.split('","'), 1):
                productSubjectRelKw = (productId + '||' + productSubject + '||'
                                       + keyword + '||' + str(rank) + '||'
                                       + keywordsType)
                writeResultToFile(productSubjectRelKw, targetFilePath)
    return lineCount
#end of getUrlResult
def main():
    """Run the keyword export from the command line and print a summary.

    Reads the input path from ``sys.argv[1]`` and the output path from
    ``sys.argv[2]``, processes the input, then prints the elapsed time and
    the number of lines handled.

    Raises:
        IndexError: when fewer than two command-line arguments are given.
    """
    start_time = datetime.now()
    srcFilePath = sys.argv[1]  # input file
    targetFilePath = sys.argv[2]  # output file
    count_total = getUrlResult(srcFilePath, targetFilePath)
    end_time = datetime.now()
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("\n==================")
    # Two spaces after the colon reproduce the output of the original
    # comma-separated print statement.
    print("Time total used :  %s" % (end_time - start_time))
    print("Total: %s," % (count_total))
    print("==================")
#end of main
# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# 转载请注明:数据分析 » Python发送WEB请求,并对WEB内容进行解析