python 正则表达式匹配中文utf8 |
|
python 正则表达式匹配中文utf8 |
>>> import re
>>> x = u'中国'.encode('utf8')
>>> s = x.__repr__()
>>> s
"'\\xc3\\x96\\xc3\\x90\\xc2\\xb9\\xc3\\xba'"
>>> pattern = '.*%s.*' % s[1:-1]
>>> pattern
'.*\\xc3\\x96\\xc3\\x90\\xc2\\xb9\\xc3\\xba.*'
>>> r = re.match(pattern,u'我是中国人'.encode('utf8'))
>>> r
<_sre.SRE_Match object at 0x012A52C0>
>>> dir(r)
['__copy__', '__deepcopy__', 'end', 'expand', 'group', 'groupdict', 'groups', 'span', 'start']
>>> r.group()
'\xc3\x8e\xc3\x92\xc3\x8a\xc3\x87\xc3\x96\xc3\x90\xc2\xb9\xc3\xba\xc3\x88\xc3\x8b'
>>>
|
新闻抓取爬虫 PPT |
|
新闻抓取爬虫 PPT |
# -*- coding: utf-8 -*-
import re
import sys,os
import re
chars = []  # global vocabulary: every distinct character seen, in first-seen order

def generate_chars(strList):
    """Build the global character vocabulary from a list of strings.

    Appends each character not already present to the module-level
    ``chars`` list, preserving first-seen order.  Mutates global state
    and returns None.
    """
    # FIX: `char not in chars` scanned the list on every character
    # (accidental O(n^2)); a shadow set gives O(1) membership while the
    # list keeps the original order and contents.
    seen = set(chars)
    for sr in strList:
        for char in sr:
            if char not in seen:
                seen.add(char)
                chars.append(char)
def vector(sr):
    """Map string *sr* onto the global ``chars`` vocabulary.

    Returns a 0/1 list the length of ``chars``: element i is 1 iff
    chars[i] occurs somewhere in *sr*.
    """
    # `ch in sr` is equivalent to the original `sr.find(ch) != -1`
    return [1 if ch in sr else 0 for ch in chars]
def intersaction(str1, str2):
    """Return how many vocabulary characters the two strings share.

    Computed as the dot product of the two 0/1 character vectors, i.e.
    the size of the character overlap restricted to the global ``chars``
    vocabulary.  (Name kept as in the original source, though the usual
    spelling is "intersection".)
    """
    v1 = vector(str1)
    v2 = vector(str2)
    # sum/zip replaces the Python-2-only `xrange` index loop
    return sum(a * b for a, b in zip(v1, v2))
def detect_repeat(strList):
    """Filter near-duplicate strings out of *strList*.

    A string is kept only if it shares fewer than ``critical``
    vocabulary characters with every string already kept; kept strings
    are returned in input order.
    """
    # FIX: the original wrote `chars = generate_chars(strList)`, binding
    # the function's None return to a useless local; generate_chars only
    # populates the module-level vocabulary as a side effect.
    generate_chars(strList)
    # threshold: sharing this many (or more) characters counts as a repeat
    critical = 3
    results = []
    for st in strList:
        if all(intersaction(st, rt) < critical for rt in results):
            results.append(st)
    # return the de-duplicated strings
    return results
if __name__ == "__main__":
    # demo: two near-duplicate pairs; detect_repeat keeps one of each
    strLists = [u"大家好啊", u"大家好", u"张靓颖新专辑", u"张靓颖专辑"]
    results = detect_repeat(strLists)
    # FIX: single-argument print() calls run under both Python 2 and 3;
    # the bare print statements were Python-2-only syntax.
    print("result " + " > " * 20)
    for ru in results:
        print(ru.encode("utf-8"))
|
Python保存二进制数据到sqlite3 |
|
Python保存二进制数据到sqlite3 |
# coding: utf8
# Python2.6.2
# Round-trips a binary file through an sqlite3 BLOB column:
# reads 0.bin, stores it, reads it back, writes it to 00.bin.
import sqlite3
db = sqlite3.connect('test.db')
try:
    cur = db.cursor()
    cur.execute("CREATE TABLE if not exists t (b BLOB);")
    # sqlite3.Binary wraps the raw bytes so they are bound as a BLOB,
    # not as TEXT
    with open('0.bin', 'rb') as f:
        cur.execute("insert into t values(?)", (sqlite3.Binary(f.read()),))
    db.commit()
    cur.execute('select b from t limit 1')
    b = cur.fetchone()[0]
    with open('00.bin', 'wb') as f:
        f.write(b)
finally:
    # FIX: close the connection even when a file or SQL operation raises;
    # the original leaked it on any error path
    db.close()
|
找email |
|
关于 Python 抓取网页的问题 |
# Match a whole line that looks like an email address.
rawstr = r'^[a-z0-9A-Z_\-]{1,}@[a-z0-9A-Z_\-]{1,}\.[a-z0-9A-Z_\-.]{1,}$'
# FIX: the original called re.compile(pattern), referencing `pattern`
# before it existed (NameError); the raw pattern string must be compiled.
pattern = re.compile(rawstr)
# `字符串` is the caller-supplied text to scan (placeholder name from the
# original snippet — substitute the real string variable).
emails = pattern.findall(字符串)
|
httplib抓网页 |
|
httplib抓取UTF8编码的网页,将内容解码时出错 |
# -*- coding: utf-8 -*-
# Fetch www.douban.com and dump the UTF-8 page body to a local log file.
import codecs
import httplib
import sys
conn = httplib.HTTPConnection('www.douban.com', 80)
try:
    conn.request('GET', '/')
    resp = conn.getresponse()
    # FIX: decode once at the I/O boundary; the 'replace' error handler
    # keeps a stray malformed byte in the response from aborting the whole
    # dump with UnicodeDecodeError (the failure the original snippet hit).
    text = resp.read().decode('utf8', 'replace')
    # codecs.open re-encodes the unicode text as UTF-8 on write
    f = codecs.open('C:\\tmp\\web.log', 'w', 'utf8')
    try:
        f.write(text)
    finally:
        f.close()
finally:
    # FIX: release the connection even if the request, decode or write raises
    conn.close()
|
美丽的汤 |
|
Beautiful Soup 抓取网页的问题 |
import urllib2
from BeautifulSoup import BeautifulSoup,Tag
import re
import os
import stat
def analysis(url_str):
    """Fetch *url_str* and return its <title> tag (page assumed GB2312)."""
    page = urllib2.urlopen(url_str)
    # parse with an explicit source encoding so Chinese pages decode correctly
    parsed = BeautifulSoup(page, fromEncoding="gb2312")
    title_tag = parsed.html.head.title
    return title_tag
# demo: print the <title> of the 163.com front page
url_str = "http://www.163.com"
str_string = analysis(url_str)
# FIX: single-argument print() runs under both Python 2 and 3; the bare
# print statement was Python-2-only syntax.
print(str_string)
|
抓中文网页乱码的解释 |
|
刚学python,抓中文网页遇到编码的问题,怎么转换也不行…… |
# -*- coding: utf-8 -*-
import urllib2, htmllib, formatter
class LinksExtractor(htmllib.HTMLParser):
    """Collect every href and its anchor text from an HTML document."""

    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)
        self.links = []      # href attribute values, in document order
        self.archtexts = []  # text chunks seen inside <a>...</a>
        self.in_anchor = 0   # truthy while the parser is inside an anchor

    def start_a(self, attrs):
        # entering an <a> tag: remember the state and record its href
        self.in_anchor = 1
        for name, value in attrs:
            if name == "href":
                self.links.append(value)

    def end_a(self):
        self.in_anchor = 0

    def handle_data(self, text):
        # only text between <a> and </a> is interesting
        if self.in_anchor:
            self.archtexts.append(text)

    def get_links(self):
        return self.links
# get html source
request = urllib2.Request('http://www.baidu.com/')
request.add_header('User-Agent', 'Mozilla/5.0')
opener = urllib2.build_opener()
htmlSource = opener.open(request).read()
# NullFormatter: parse only, produce no formatted output.
# FIX: renamed from `format`, which shadowed the builtin.
fmt = formatter.NullFormatter()
htmlparser = LinksExtractor(fmt)
htmlparser.feed(htmlSource)
htmlparser.close()
links = htmlparser.get_links()
# FIX: iterate links and anchor texts in lockstep instead of indexing via
# range(len(...)); zip also cannot raise IndexError if the lists differ
# in length.  Parenthesized print runs under both Python 2 and 3.
for url, text in zip(htmlparser.links, htmlparser.archtexts):
    print("url: %s, text: %s" % (url, text))
|
通过pyODBC 连接mssql 配置后无法连接 |
|
通过pyODBC 连接mssql 配置后无法连接 |
import urllib2
from BeautifulSoup import BeautifulSoup,Tag
import re
import os
import stat
#import MySQLdb
import pyodbc
# DSN-less connection string for the local SQL Server default instance
source = 'DRIVER={SQL Server};SERVER=.;DATABASE=test;UID=sa;PWD=123456'
db = pyodbc.connect(source)
cursor = db.cursor()
# FIX: T-SQL string literals use single quotes; double quotes denote
# identifiers under QUOTED_IDENTIFIER ON, so term="103" fails with
# "invalid column name '103'".  A bound parameter sidesteps the quoting
# problem (and SQL injection) entirely.
cursor.execute('select url,term from html_url where term=?', ('103',))
for row in cursor.fetchall():
    url_str = str(row[0])
    # parenthesized print runs under both Python 2 and 3
    print(url_str)
|