python正则表达式最详解

一、正则表达式–元字符

re 模块使 Python 语言拥有全部的正则表达式功能

1. 数量词

# Extract words made of mixed upper- and lower-case letters.
import re
# The sample contains upper-case letters so the A-Z half of the class is exercised.
a = 'Excel 12345Word23456PPT12Lr'
# {3,5} matches runs of 3 to 5 letters; [a-zA-Z] covers both cases.
r = re.findall('[a-zA-Z]{3,5}', a)
print(r)
# ['Excel', 'Word', 'PPT']
# Greedy vs. non-greedy (Python quantifiers are greedy by default):
#   greedy:      '[a-zA-Z]{3,5}'   takes as many letters as possible (up to 5)
#   non-greedy:  '[a-zA-Z]{3,5}?'  takes as few as possible; here equivalent to '[a-zA-Z]{3}'
# Prefer the explicit '{3}' form so the trailing '?' is not confused with
# the "zero or one" quantifier '?' shown further below.
# '*' matches the preceding character zero or more times.
import re
a = 'exce0excell3excel3'
# For contrast: '.*' means "any characters, as many as possible", so
# 'excel.*' swallows everything from the first 'excel' to the end.
r_dot_star = re.findall('excel.*', a)
print(r_dot_star)
# ['excell3excel3']
# 'excel*' matches 'exce' followed by no 'l', one 'l', or many 'l's.
r = re.findall('excel*', a)
print(r)
# ['exce', 'excell', 'excel']
# '+' matches the preceding character one or more times,
# so at least one 'l' must follow 'exce'.
import re
a = 'exce0excell3excel3'
one_or_more_l = re.compile('excel+')
r = one_or_more_l.findall(a)
print(r)
# ['excell', 'excel']
# '?' matches the preceding character zero or one time; it is often used
# to collapse a repeated trailing character.
import re
a = 'exce0excell3excel3'
optional_l = re.compile('excel?')
r = optional_l.findall(a)
print(r)
# ['exce', 'excel', 'excel']

2. 字符匹配

# Character classes: [...] matches one character from the set,
# [^...] matches one character that is NOT in the set.
line = 'xyz,xcz.xfc.xdz,xaz,xez,xec'
# 'x', then 'd' or 'e', then 'z'.
r = re.compile('x[de]z').findall(line)
print(r)
# ['xdz', 'xez']
# 'x', then any single character other than 'd' or 'e', then 'z'.
r = re.compile('x[^de]z').findall(line)
print(r)
# ['xyz', 'xcz', 'xaz']

# \w matches word characters: letters (including Chinese), digits and the
# underscore. It does not match whitespace or punctuation.
import re
a = 'excel 12345word\n23456_ppt12lr'
# Raw string: '\w' in a normal literal is an invalid escape sequence.
r = re.findall(r'\w', a)
print(r)
# ['e', 'x', 'c', 'e', 'l', '1', '2', '3', '4', '5', 'w', 'o', 'r', 'd', '2', '3', '4', '5', '6', '_', 'p', 'p', 't', '1', '2', 'l', 'r']
# \W (upper case) is the complement of \w: it matches non-word characters
# such as spaces, '\n' and '\t'.
import re
a = 'excel 12345word\n23456_ppt12lr'
# Raw string keeps the backslash for the regex engine.
r = re.findall(r'\W', a)
print(r)
# [' ', '\n']

3. 边界匹配

# Anchors: '^' and '$' force the whole string to be 8 to 11 digits,
# otherwise nothing is extracted.
import re
tel = '13811115888'
# Raw string: '\d' in a normal literal is an invalid escape sequence.
r = re.findall(r'^\d{8,11}$', tel)
print(r)
# ['13811115888']

4. 组

# Parentheses turn 'abc' into a group; {2} repeats the whole group,
# so '(abc){2}' matches 'abcabc'.
import re
a = 'abcabcabcxyzabcabcxyzabc'
# With one capturing group, findall returns the text captured by the
# group (its last repetition) rather than the whole match.
r = re.findall('(abc){2}', a)
print(r)
# ['abc', 'abc']
r = re.findall('(abc){3}', a)
print(r)
# ['abc']

5. 匹配模式参数

# The third argument of findall is a flag; re.I ignores case.
import re
# The sample contains upper-case text so that the flag makes a difference.
a = 'abcFBIabcCIAabc'
r = re.findall('fbi', a, re.I)
print(r)
# ['FBI']
# Several flags are combined with '|'.
import re
a = 'abcFBI\nabcCIAabc'
# re.I ignores case; re.S lets '.' match a newline as well.
# The pattern matches 'fbi' followed by exactly one character, '\n' included.
r = re.findall('fbi.{1}', a, re.I | re.S)
print(r)
# ['FBI\n']

二、方法

re.findall

匹配出字符串中所有与指定模式相匹配的子串
以列表的形式返回
未匹配则返回空列表

import re
re.findall(pattern, string, flags=0)
pattern.findall(string[ , pos[ , endpos]])

import re
line = "111aaabbb222小呼噜奥利奥"
# Collect every single digit in the string, in order of appearance.
digit = re.compile('[0-9]')
r = digit.findall(line)
print(r)
# ['1', '1', '1', '2', '2', '2']

re.match

re.match 尝试从字符串的起始位置匹配一个模式
如果不是在起始位置匹配成功的话，match() 就返回 None。

re.match(pattern, string, flags=0)
# (标准，要匹配的，标志位)

# re.match only succeeds when the pattern matches at the start of the string.
print(re.match('www', 'www.xxxx.com'))         # match object for 'www'
print(re.match('www', 'www.xxxx.com').span())  # (start, end) of the match
print(re.match('com', 'www.xxxx.com'))         # 'com' is not at the start

<re.Match object; span=(0, 3), match='www'>
(0, 3)
None

group匹配对象

import re
a = 'life is short,i use python,i love python'
two_groups = re.compile('life(.*)python(.*)python')
r = two_groups.search(a)
# group(0): the entire match -> life is short,i use python,i love python
print(r.group(0))
# group(1): text captured by the first group ->  is short,i use
print(r.group(1))
# group(2): text captured by the second group -> ,i love
print(r.group(2))
# Several indices at once return a tuple:
# ('life is short,i use python,i love python', ' is short,i use ', ',i love ')
print(r.group(0,1,2))
# groups(): all captured groups, i.e. group(1) and group(2):
# (' is short,i use ', ',i love ')
print(r.groups())

import re
# .*      greedy: zero or more of any character except a newline ('\n')
# (.*?)   non-greedy group: captures as few characters as possible
# re.M    multi-line mode, changes where ^ and $ match
# re.I    case-insensitive matching
line = "cats are smarter than dogs"
matchobj1 = re.match(r'(.*) are (.*?) .*', line, re.M | re.I)
matchobj2 = re.match(r'(.*) smarter (.*?) .*', line, re.M | re.I)
matchobj3 = re.match(r'(.*) than (.*)', line, re.M | re.I)
print(matchobj1)
print(matchobj2)
print(matchobj3)
# <re.Match object; span=(0, 26), match='cats are smarter than dogs'>
# <re.Match object; span=(0, 26), match='cats are smarter than dogs'>
# <re.Match object; span=(0, 26), match='cats are smarter than dogs'>
# Report the whole match and both groups for each result; a failed
# match is None and therefore falsy.
for label, m in (("matchobj1", matchobj1),
                 ("matchobj2", matchobj2),
                 ("matchobj3", matchobj3)):
    if m:
        print(label + ".group() : ", m.group())
        print(label + ".group(1) : ", m.group(1))
        print(label + ".group(2) : ", m.group(2))
    else:
        print("no match!!")
# matchobj1.group() :  cats are smarter than dogs
# matchobj1.group(1) :  cats
# matchobj1.group(2) :  smarter
# matchobj2.group() :  cats are smarter than dogs
# matchobj2.group(1) :  cats are
# matchobj2.group(2) :  than
# matchobj3.group() :  cats are smarter than dogs
# matchobj3.group(1) :  cats are smarter
# matchobj3.group(2) :  dogs

import re
# '.'  matches a single character
# '*'  repeats the preceding item zero or more times
# '.*' therefore matches any run of characters, as long as possible
# Named 'text' rather than 'str' so the builtin str is not shadowed.
text = "a b a b"
matchobj1 = re.match(r'a(.*)b', text, re.M | re.I)   # greedy: up to the last 'b'
matchobj2 = re.match(r'a(.*?)b', text, re.M | re.I)  # non-greedy: up to the first 'b'
print("matchobj1.group() : ", matchobj1.group())
print("matchobj2.group() : ", matchobj2.group())
# matchobj1.group() :  a b a b
# matchobj2.group() :  a b

re.search

扫描整个字符串并返回第一个成功的匹配。

re.search(pattern, string, flags=0)

import re
line = "cats are smarter than dogs"
# match only tries at the start of the string; search scans all of it.
matchobj = re.match(r'dogs', line, re.M | re.I)       # 'dogs' is not at the start
matchobj1 = re.search(r'dogs', line, re.M | re.I)     # found further along
matchobj2 = re.match(r'(.*) dogs', line, re.M | re.I) # '(.*) ' consumes the prefix
for label, m in (("matchobj", matchobj),
                 ("matchobj1", matchobj1),
                 ("matchobj2", matchobj2)):
    if m:
        print("match --> " + label + ".group() : ", m.group())
    else:
        print("no match!!")
# no match!!
# match --> matchobj1.group() :  dogs
# match --> matchobj2.group() :  cats are smarter than dogs

re.compile

re.compile 将正则表达式字符串编译为模式对象（Pattern）。
这样可以更高效地匹配：编译一次之后，以后每次使用该模式时就不用再次编译，例如 pattern = re.compile(r'\d+')，之后可反复调用 pattern.findall(text)。

三、检索和替换

re.sub 替换字符串

re.sub('被替换的','替换成的',a)

# Replace every 'fbi' with 'bbq'.
import re
a = 'abcfbiabcciaabc'
r = re.sub('fbi', 'bbq', a)
print(r)
# abcbbqabcciaabc
# count=1 replaces only the first occurrence; the default count=0 replaces all.
# count is passed by keyword: passing it positionally is deprecated.
a = 'abcfbiabcfbiafbiciaabc'
r = re.sub('fbi', 'bbq', a, count=1)
print(r)
# abcbbqabcfbiafbiciaabc

# A function can be passed to re.sub as the replacement, which hands the
# substitution logic over to that function; here every 'fbi' becomes '$fbi$'.
import re
a = 'abcfbiabcfbiafbiciaabc'
def 函数名(形参):
    """Return the matched text wrapped in '$' signs.

    re.sub calls this once per match with the match object; group()
    yields the matched substring ('fbi' for this pattern).
    """
    return '$' + 形参.group() + '$'
r = re.sub('fbi', 函数名, a)
print(r)
# abc$fbi$abc$fbi$a$fbi$ciaabc

总结

本篇文章就到这里了，希望能够给你带来帮助，也希望您能够多多关注www.887551.com的更多内容！

黄山市民网：https://www.huangshanshimin.com/

目录

一、正则表达式–元字符

1. 数量词

2. 字符匹配

3. 边界匹配

4. 组

5. 匹配模式参数

二、方法

re.findall

re.match

group匹配对象

re.search

re.compile

三、检索和替换

re.sub 替换字符串

总结

相关文章