`
潇湘竹溪
  • 浏览: 446 次
  • 性别: Icon_minigender_1
  • 来自: 广州
最近访客 更多访客>>
文章分类
社区版块
存档分类
最新评论
收藏列表
标题 标签 来源
python 正则表达式匹配中文utf8 python 正则表达式匹配中文utf8
>>> import re
>>> x = u'中国'.encode('utf8')
>>> s = x.__repr__()
>>> s
"'\\xc3\\x96\\xc3\\x90\\xc2\\xb9\\xc3\\xba'"
>>> pattern = '.*%s.*' % s[1:-1]
>>> pattern
'.*\\xc3\\x96\\xc3\\x90\\xc2\\xb9\\xc3\\xba.*'
>>> r = re.match(pattern,u'我是中国人'.encode('utf8'))
>>> r
<_sre.SRE_Match object at 0x012A52C0>
>>> dir(r)
['__copy__', '__deepcopy__', 'end', 'expand', 'group', 'groupdict', 'groups', 'span', 'start']
>>> r.group()
'\xc3\x8e\xc3\x92\xc3\x8a\xc3\x87\xc3\x96\xc3\x90\xc2\xb9\xc3\xba\xc3\x88\xc3\x8b'
>>> 
新闻抓取爬虫 PPT 新闻抓取爬虫 PPT
# -*- coding: utf-8 -*-
import re 
import sys,os
import re

chars = []  # global vocabulary: every distinct character seen so far, first-seen order


def generate_chars(strList):
    """Extend the global ``chars`` vocabulary with every character of every
    string in *strList*, keeping first-seen order and skipping duplicates.

    Mutates the module-level ``chars`` list in place; returns None.
    """
    # FIX: the original tested `char not in chars` against a growing list,
    # making vocabulary construction quadratic.  A shadow set gives O(1)
    # membership while the list still records insertion order.
    seen = set(chars)
    for sr in strList:
        for char in sr:
            if char not in seen:
                seen.add(char)
                chars.append(char)


def vector(sr):
    """Map the string *sr* onto the global ``chars`` vocabulary.

    Returns a 0/1 list parallel to ``chars``: entry i is 1 when chars[i]
    occurs anywhere in *sr*, else 0.
    """
    # `char in sr` is the idiomatic equivalent of `sr.find(char) != -1`,
    # and the comprehension replaces the manual append loop.
    return [1 if char in sr else 0 for char in chars]

def intersaction(str1, str2):
    """Return how many vocabulary characters *str1* and *str2* share.

    Computed as the dot product of the two strings' 0/1 vectors over the
    global ``chars`` vocabulary.  (Misspelled name kept for existing callers.)
    """
    v1 = vector(str1)
    v2 = vector(str2)
    # zip replaces the Python-2-only xrange/index loop; sum runs at C speed.
    return sum(a * b for a, b in zip(v1, v2))

def detect_repeat(strList):
    """Filter *strList* down to strings that are "different enough".

    A string is kept only when it shares fewer than ``critical`` vocabulary
    characters with every string already kept; near-duplicates are dropped.
    """
    # BUG FIX: the original wrote `chars = generate_chars(strList)`, binding
    # a useless local to None (generate_chars returns nothing) -- it only
    # works through the side effect on the global `chars` list.  Call it
    # purely for that side effect.
    generate_chars(strList)

    # threshold: strings sharing >= `critical` characters count as repeats
    critical = 3
    results = []
    for st in strList:
        if all(intersaction(st, rt) < critical for rt in results):
            results.append(st)

    return results


if __name__ =="__main__":
    strLists = [u"大家好啊",u"大家好",u"张靓颖新专辑",u"张靓颖专辑"]
    results = detect_repeat(strLists)
    print "result " + " > " * 20
    for ru in results:
        print ru.encode("utf-8")


Python保存二进制数据到sqlite3 Python保存二进制数据到sqlite3
# coding: utf8

# Python2.6.2

import sqlite3

# Demo: round-trip a binary file through a SQLite BLOB column.
connection = sqlite3.connect('test.db')
cursor = connection.cursor()

# One-column table; created only on the first run.
cursor.execute("CREATE TABLE if not exists t (b BLOB);")

# Store the raw bytes of 0.bin as a BLOB.
with open('0.bin', 'rb') as src:
    cursor.execute("insert into t values(?)", (sqlite3.Binary(src.read()), ))
    connection.commit()

# Read one BLOB back out and dump it to 00.bin.
cursor.execute('select b from t limit 1')
blob = cursor.fetchone()[0]

with open('00.bin', 'wb') as dst:
    dst.write(blob)

connection.close()
找email 关于 PYthon 抓取网页的问题
import re

# Matcher for a whole string shaped like an email address.
rawstr = r'^[a-z0-9A-Z_\-]{1,}@[a-z0-9A-Z_\-]{1,}\.[a-z0-9A-Z_\-.]{1,}$'
# BUG FIX: the original called re.compile(pattern), compiling the
# not-yet-defined name `pattern` instead of the regex source `rawstr`.
pattern = re.compile(rawstr)
# The original applied findall to an undefined placeholder (字符串, i.e.
# "the string"); bind the text you want to scan here.
text = ''
emails = pattern.findall(text)
httplib抓网页 httplib抓取UTF8编码的网页,将内容解码时出错
# -*- coding: utf-8 -*-
import codecs
import httplib
import sys

#reload(sys)
#sys.setdefaultencoding('utf8')
#print sys.getdefaultencoding()

# Fetch the douban.com front page over plain HTTP (Python 2 httplib).
connection = httplib.HTTPConnection('www.douban.com',80)
connection.request('GET', '/')
response = connection.getresponse()

# Decode the UTF-8 body once, then persist it through a UTF-8 codec writer.
log = codecs.open('C:\\tmp\\web.log', 'w', 'utf8')
log.write(response.read().decode('utf8'))
log.close()
connection.close()
美丽的汤 Beautiful Soup 抓取网页的问题
import urllib2
from BeautifulSoup import BeautifulSoup,Tag
import re
import os
import stat	
def analysis(url_str):
	"""Fetch *url_str*, parse it as gb2312 HTML, and return its <title> tag."""
	response = urllib2.urlopen(url_str)
	soup = BeautifulSoup(response, fromEncoding="gb2312")
	title_tag = soup.html.head.title
	return title_tag

	
url_str="http://www.163.com"
str_string=analysis(url_str)
print str_string
抓中文网页乱码的解释 刚学python,抓中文网页遇到编码的问题,怎么转换也不行……
# -*- coding: utf-8 -*-


import urllib2, htmllib, formatter

class LinksExtractor(htmllib.HTMLParser):
	"""Collect href targets and anchor texts while parsing an HTML document."""

	def __init__(self, formatter):
		htmllib.HTMLParser.__init__(self, formatter)
		self.links = []      # href values of <a> tags seen so far
		self.archtexts = []  # text chunks that appeared inside <a>...</a>
		self.in_anchor = 0   # truthy while between <a> and </a>

	def start_a(self, attrs):
		# Entering an anchor: remember its href attribute, if present.
		# NOTE(review): an <a> without href still toggles in_anchor, so
		# self.links and self.archtexts are NOT guaranteed to stay parallel;
		# callers indexing them pairwise should use zip or guard lengths.
		# (Removed the original's stray semicolon, redundant `len(attrs) > 0`
		# guard, and dead `text = text` statement in handle_data.)
		self.in_anchor = 1
		for attr in attrs:
			if attr[0] == "href":
				self.links.append(attr[1])

	def end_a(self):
		self.in_anchor = 0

	def handle_data(self, text):
		# Only record character data that appears inside an anchor.
		if self.in_anchor:
			self.archtexts.append(text)

	def get_links(self):
		return self.links

#get html source
request = urllib2.Request('http://www.baidu.com/')
#request = urllib2.Request('http://localhost:8080/')
request.add_header('User-Agent', 'Mozilla/5.0')
opener = urllib2.build_opener()
htmlSource = opener.open(request).read()


format = formatter.NullFormatter()          
htmlparser = LinksExtractor(format)        

htmlparser.feed(htmlSource)      
htmlparser.close()

links = htmlparser.get_links()  
for i in range(len(htmlparser.links)):
	temp = htmlparser.archtexts[i]
	print "url: %s, text: %s" % (htmlparser.links[i], temp)
通过pyODBC 连接mssql 配置后无法连接 通过pyODBC 连接mssql 配置后无法连接
import urllib2
from BeautifulSoup import BeautifulSoup,Tag
import re
import os
import stat
#import MySQLdb
import pyodbc

source = 'DRIVER={SQL Server};SERVER=.;DATABASE=test;UID=sa;PWD=123456'
db = pyodbc.connect(source)

cursor = db.cursor()

cursor.execute('select url,term from html_url where term="103"')

for i in cursor.fetchall():
	url_str=str(i[0])
	print url_str

Global site tag (gtag.js) - Google Analytics