
正则表达式

正则表达式(Regular Expression)是一种强大的文本处理工具,用于匹配、查找、替换和分割字符串。Python 通过 re 模块提供正则表达式支持。
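
下面先用一个简短的示意例子预览"匹配、查找、替换、分割"这四类操作(示例字符串为假设的内容):

import re

text = "order-42, order-7"

print(re.search(r'\d+', text).group())  # 42:查找第一个匹配
print(re.findall(r'\d+', text))         # ['42', '7']:找出所有匹配
print(re.sub(r'\d+', 'N', text))        # order-N, order-N:替换
print(re.split(r',\s*', text))          # ['order-42', 'order-7']:分割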

re 模块基础

导入模块

import re

# 原始字符串:避免转义字符的干扰
pattern = r'\d+' # 推荐:使用原始字符串
pattern2 = '\\d+' # 不推荐:需要双重转义

核心方法对比

import re

text = "Python 3.9, Python 3.10, Python 3.11"

# 1. re.match():只从字符串开头匹配
result = re.match(r'Python', text)
print(result.group()) # Python
print(result.span()) # (0, 6)

result = re.match(r'3\.9', text)
print(result) # None:不匹配开头

# 2. re.search():扫描整个字符串,找到第一个匹配
result = re.search(r'3\.\d+', text)
print(result.group()) # 3.9

# 3. re.findall():返回所有非重叠匹配
results = re.findall(r'3\.\d+', text)
print(results) # ['3.9', '3.10', '3.11']

# 4. re.finditer():返回所有匹配的迭代器
for match in re.finditer(r'3\.\d+', text):
print(f"Found: {match.group()} at {match.span()}")

# 5. re.sub():替换匹配项
new_text = re.sub(r'3\.\d+', '4.0', text)
print(new_text) # Python 4.0, Python 4.0, Python 4.0

# 6. re.split():根据匹配分割字符串
parts = re.split(r',\s*', text)
print(parts) # ['Python 3.9', 'Python 3.10', 'Python 3.11']
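
需要注意,match() 和 search() 未命中时返回 None,直接调用 .group() 会抛出 AttributeError。下面是一个简单的防御性写法(示意):

result = re.search(r'\d{4}', "no digits here")
if result is not None:
    print(result.group())
else:
    print("未找到匹配") # 未找到匹配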

编译正则表达式

import re

# 编译:提高重复使用效率
pattern = re.compile(r'\b\w{4}\b') # 匹配四个字母的单词
text = "This is a test code"

# 使用编译后的模式
matches = pattern.findall(text)
print(matches) # ['This', 'test', 'code']

# 编译时指定标志
pattern = re.compile(r'python', re.IGNORECASE)
print(pattern.findall("I love Python and python")) # ['Python', 'python']

元字符

特殊字符列表

import re

"""
元字符列表:
. 匹配任意字符(除换行符)
^ 匹配字符串开头
$ 匹配字符串结尾
* 重复0次或多次
+ 重复1次或多次
? 重复0次或1次
{ } 自定义重复次数
[ ] 字符集
\ 转义字符
| 或运算
( ) 分组
"""

# 转义元字符:匹配特殊字符本身
text = "Price: $100, Discount: 20%"
pattern = r'\$\d+' # 匹配美元符号
print(re.findall(pattern, text)) # ['$100']

# 点号.:匹配任意字符
text = "cat, cot, cut, c_t, c\nt"
print(re.findall(r'c.t', text)) # ['cat', 'cot', 'cut', 'c_t']
# 点号不匹配换行符
print(re.findall(r'c.t', text, re.DOTALL)) # ['cat', 'cot', 'cut', 'c_t', 'c\nt']
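
如果要匹配的文本来自变量、本身可能包含元字符,可以用 re.escape() 自动转义(下面的价格字符串是假设的示例):

price = "$9.99"
escaped = re.escape(price)
print(escaped) # \$9\.99
print(re.findall(escaped, "原价$9.99,现价$5.99")) # ['$9.99']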

字符匹配

预定义字符类

import re

text = "Python 3.11 released in 2023, price $99.99!"

"""
\d:匹配数字 [0-9]
\D:匹配非数字 [^0-9]
\s:匹配空白 [ \t\n\r\f\v]
\S:匹配非空白
\w:匹配单词字符 [a-zA-Z0-9_]
\W:匹配非单词字符
"""

# 匹配所有数字
numbers = re.findall(r'\d+', text)
print(numbers) # ['3', '11', '2023', '99', '99']

# 匹配单词
words = re.findall(r'\w+', text)
print(words) # ['Python', '3', '11', 'released', 'in', '2023', 'price', '99', '99']

# 匹配空白分隔的单词
words = re.findall(r'\S+', text)
print(words) # ['Python', '3.11', 'released', 'in', '2023,', 'price', '$99.99!']
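
默认的 Unicode 模式下,\w 也会匹配中文等非 ASCII 字符;如果只想匹配 ASCII,可以加 re.ASCII 标志。简单示意如下:

mixed = "你好 world 123"
print(re.findall(r'\w+', mixed))           # ['你好', 'world', '123']
print(re.findall(r'\w+', mixed, re.ASCII)) # ['world', '123']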

字符集

import re

"""
[abc]:匹配a、b或c
[^abc]:匹配除a、b、c之外的字符
[a-z]:匹配a到z的小写字母
[A-Z]:匹配A到Z的大写字母
[0-9]:匹配数字
[a-zA-Z]:匹配所有字母
[a-zA-Z0-9]:等价于\w
"""

# 匹配元音字母
text = "Hello World"
vowels = re.findall(r'[aeiouAEIOU]', text)
print(vowels) # ['e', 'o', 'o']

# 匹配非元音字母
consonants = re.findall(r'[^aeiouAEIOU\W]', text)
print(consonants) # ['H', 'l', 'l', 'W', 'r', 'l', 'd']

# 匹配大写字母
uppercase = re.findall(r'[A-Z]', text)
print(uppercase) # ['H', 'W']

# 匹配十六进制数
hex_text = "0x1A 0xFF 0x3C"
hex_numbers = re.findall(r'0x[0-9A-Fa-f]+', hex_text)
print(hex_numbers) # ['0x1A', '0xFF', '0x3C']
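
字符集中还可以直接使用 Unicode 码位范围,例如用 [\u4e00-\u9fa5] 匹配常用汉字(示意例子):

text = "Hello 世界, Python 你好"
print(re.findall(r'[\u4e00-\u9fa5]+', text)) # ['世界', '你好']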

重复匹配

量词

import re

"""
*:重复0次或多次 {0,}
+:重复1次或多次 {1,}
?:重复0次或1次 {0,1}
{n}:重复n次
{n,}:重复至少n次
{n,m}:重复n到m次
"""

text = "a, aa, aaa, aaaa, b"

# *:0次或多次
print(re.findall(r'a*', text)) # 匹配所有a和空字符串

# +:1次或多次
print(re.findall(r'a+', text)) # ['a', 'aa', 'aaa', 'aaaa']

# ?:0次或1次
text2 = "color, colour"
print(re.findall(r'colou?r', text2)) # ['color', 'colour']

# {n}:精确n次
text3 = "123 4567 89"
print(re.findall(r'\d{3}', text3)) # ['123', '456']

# {n,}:至少n次
print(re.findall(r'\d{3,}', text3)) # ['123', '4567']

# {n,m}:n到m次
print(re.findall(r'\d{3,4}', text3)) # ['123', '4567']

实际应用

import re

# 匹配IP地址
ip = "192.168.1.1 10.0.0.1 255.255.255.0"
pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
print(re.findall(pattern, ip)) # ['192.168.1.1', '10.0.0.1', '255.255.255.0']

# 匹配日期(YYYY-MM-DD)
date_text = "2023-01-15, 1999-12-31, 2023-1-1"
pattern = r'\d{4}-\d{2}-\d{2}'
print(re.findall(pattern, date_text)) # ['2023-01-15', '1999-12-31']

# 匹配时间(HH:MM:SS)
time_text = "23:59:59, 00:00:00, 9:30:00"
pattern = r'\d{2}:\d{2}:\d{2}'
print(re.findall(pattern, time_text)) # ['23:59:59', '00:00:00']
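
需要注意,上面这些模式只限制位数,不校验取值范围,例如 \d{1,3} 同样会匹配 999 这样的非法 IP 段;更严格的写法见后面的 "IP 地址匹配" 一节。示意如下:

bad = "999.999.999.999"
print(re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', bad))
# ['999.999.999.999']:格式符合,但不是合法IP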

位置匹配

锚点

import re

"""
^:匹配字符串开头
$:匹配字符串结尾
\b:匹配单词边界
\B:匹配非单词边界
\A:匹配字符串开头(不受多行影响)
\Z:匹配字符串结尾(不受多行影响)
"""

text = "Python is awesome. I love Python"

# ^:开头
print(re.findall(r'^Python', text)) # ['Python']
print(re.findall(r'^Python', text, re.MULTILINE)) # ['Python']

# $:结尾
print(re.findall(r'Python$', text)) # ['Python']

# \b:单词边界
print(re.findall(r'\bPython\b', text)) # ['Python', 'Python']
print(re.findall(r'\bPyt', text)) # ['Pyt', 'Pyt']

# \B:非单词边界
text2 = "PythonPython"
print(re.findall(r'Python\B', text2)) # ['Python']

# 多行模式
multiline = """Line 1: Python
Line 2: Java
Line 3: Python"""

# 在多行模式下,^和$匹配每行的开头和结尾
print(re.findall(r'^Line \d+', multiline, re.MULTILINE))
# ['Line 1', 'Line 2', 'Line 3']
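
\A 和 \Z 与 ^、$ 的区别在于:即使开启 re.MULTILINE,它们也只匹配整个字符串的开头和结尾。沿用上面的 multiline 字符串做个对比(示意):

print(re.findall(r'\ALine \d+', multiline, re.MULTILINE))
# ['Line 1']:\A 只匹配整个字符串的开头
print(re.findall(r'Python\Z', multiline, re.MULTILINE))
# ['Python']:\Z 只匹配整个字符串的结尾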

分组与捕获

捕获分组

import re

# ( ):捕获分组
text = "John: 30 years, Jane: 25 years, Bob: 35 years"

# 提取名字和年龄
pattern = r'(\w+):\s*(\d+)\s*years'
matches = re.findall(pattern, text)
print(matches)
# [('John', '30'), ('Jane', '25'), ('Bob', '35')]

# 使用finditer获取详细信息
for match in re.finditer(pattern, text):
    name = match.group(1)
    age = match.group(2)
    print(f"{name} is {age} years old")

# 嵌套分组
text = "2023-12-25"
pattern = r'((\d{4})-(\d{2})-(\d{2}))'
match = re.search(pattern, text)

print(match.group(0)) # 2023-12-25 (完整匹配)
print(match.group(1)) # 2023-12-25 (第一组)
print(match.group(2)) # 2023 (第二组)
print(match.group(3)) # 12 (第三组)
print(match.group(4)) # 25 (第四组)

# groups():返回所有分组
print(match.groups()) # ('2023-12-25', '2023', '12', '25')
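
捕获分组还可以配合 re.sub 中的反向引用(\1、\2 等)对文本重新排列,下面把日期改写成 DD/MM/YYYY(示意例子):

print(re.sub(r'(\d{4})-(\d{2})-(\d{2})', r'\3/\2/\1', "2023-12-25"))
# 25/12/2023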

命名分组

import re

# (?P<name>pattern):命名捕获组
text = "user@example.com, admin@test.org"

pattern = r'(?P<username>\w+)@(?P<domain>[\w.]+)'
for match in re.finditer(pattern, text):
print(f"Username: {match.group('username')}")
print(f"Domain: {match.group('domain')}")

# groupdict():返回命名分组的字典
match = re.search(pattern, text)
print(match.groupdict())
# {'username': 'user', 'domain': 'example.com'}

# 实际应用:解析日志
log = "2023-12-25 10:30:45 [ERROR] File not found: config.ini"
pattern = r'(?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<message>.*)'
match = re.search(pattern, log)

if match:
print(f"Level: {match.group('level')}")
print(f"Message: {match.group('message')}")

非捕获分组

import re

# (?:pattern):非捕获分组,只用于分组但不捕获
text = "Python, Java, C++, JavaScript"

# 匹配"语言, "或"语言 and "
pattern = r'\w+(?:, | and )'
print(re.findall(pattern, text))
# ['Python, ', 'Java and ', 'Go, ']

# 对比捕获分组和非捕获分组
pattern1 = r'(?:\w+), (\w+)' # 第一组不捕获
pattern2 = r'(\w+), (\w+)' # 两组都捕获

match1 = re.search(pattern1, text)
match2 = re.search(pattern2, text)

print(match1.groups()) # ('Java',) # 只捕获第二组
print(match2.groups()) # ('Python', 'Java') # 捕获两组

反向引用

import re

# \n:引用第n个捕获组
text = "hello hello, world world, test test"

# 匹配重复的单词
pattern = r'(\w+) \1'
print(re.findall(pattern, text)) # ['hello', 'world', 'test']

# 匹配HTML标签(简化版)
html = "<div>Content</div><p>Paragraph</p>"
pattern = r'<(\w+)>.*?</\1>'
print(re.findall(pattern, html)) # ['div', 'p']

# 命名分组的反向引用
text = "start-start, end-end, middle-middle"
pattern = r'(?P<word>\w+)-(?P=word)'
print(re.findall(pattern, text)) # ['start', 'end', 'middle']
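
反向引用同样可以用于 re.sub,例如把连续重复的单词压缩成一个(示意例子):

dup = "hello hello world world"
print(re.sub(r'\b(\w+) \1\b', r'\1', dup)) # hello world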

贪婪与非贪婪

贪婪匹配(默认)

import re

text = "<div>Content 1</div><div>Content 2</div>"

# 贪婪:*、+、{n,m}默认尽可能多匹配
pattern_greedy = r'<div>.*</div>'
match = re.search(pattern_greedy, text)
print(match.group())
# <div>Content 1</div><div>Content 2</div>

非贪婪匹配

import re

text = "<div>Content 1</div><div>Content 2</div>"

# 非贪婪:*?、+?、??、{n,m}?尽可能少匹配
pattern_lazy = r'<div>.*?</div>'
matches = re.findall(pattern_lazy, text)
print(matches)
# ['<div>Content 1</div>', '<div>Content 2</div>']

# 对比例子
text2 = "a<b>c</b>d<b>e</b>f"

# 贪婪:匹配到最后一个</b>
print(re.search(r'<b>.*</b>', text2).group())
# <b>c</b>d<b>e</b>

# 非贪婪:匹配到第一个</b>
print(re.search(r'<b>.*?</b>', text2).group())
# <b>c</b>

# 实际应用:提取引号内容
text3 = 'He said "Hello" and she replied "Hi there"'
quotes = re.findall(r'"(.*?)"', text3)
print(quotes) # ['Hello', 'Hi there']

量词对比

import re

text = "a, ab, abb, abbb"

# *:尽可能多
print(re.findall(r'ab*', text)) # ['a', 'ab', 'abb', 'abbb']

# *?:尽可能少
print(re.findall(r'ab*?', text)) # ['a', 'a', 'a', 'a']

# +:尽可能多
print(re.findall(r'ab+', text)) # ['ab', 'abb', 'abbb']

# +?:尽可能少
print(re.findall(r'ab+?', text)) # ['ab', 'ab', 'ab']

# HTML示例
html = '<b>Bold</b> and <i>Italic</i>'

# 贪婪
print(re.findall(r'<.*>', html))
# ['<b>Bold</b> and <i>Italic</i>']

# 非贪婪
print(re.findall(r'<.*?>', html))
# ['<b>', '</b>', '<i>', '</i>']

零宽断言

先行断言(Lookahead)

import re

"""
(?=pattern):正向先行断言,后面是pattern
(?!pattern):负向先行断言,后面不是pattern
"""

# 正向先行断言:只匹配后面跟着3位数字的$
text = "Price: $100, $20, $300"
pattern = r'\$(?=\d{3})' # 匹配后面是3位数字的$
print(re.findall(pattern, text)) # ['$', '$'] ($100和$300前面的$)

# 匹配三位数,且后面是"USD"
text2 = "100 USD, 200 EUR, 300 USD, 400 GBP"
pattern = r'\d{3}(?= USD)'
print(re.findall(pattern, text2)) # ['100', '300']

# 负向先行断言:匹配后面不是特定字符的内容
text3 = "test123, test456, testabc"
pattern = r'test(?!\d)' # 匹配后面不是数字的test
print(re.findall(pattern, text3)) # ['test'] (testabc中的test)

# 匹配不以.exe结尾的文件
text4 = "file.txt, file.exe, file.pdf, file.doc, file.exe.bak"
pattern = r'\w+\.(?!exe\b)\w+'
print(re.findall(pattern, text4))
# ['file.txt', 'file.pdf', 'file.doc', 'exe.bak'] (file.exe整体被跳过,file.exe.bak中匹配到的是exe.bak)
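
先行断言常用于叠加多个条件,例如校验一个字符串同时包含数字、小写和大写字母(下面的密码规则是假设的示例):

def check_password(pwd):
    """至少8位,且同时包含数字、小写字母和大写字母(规则为示例假设)"""
    pattern = r'^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{8,}$'
    return bool(re.match(pattern, pwd))

print(check_password("Abc12345")) # True
print(check_password("abc12345")) # False:缺少大写字母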

后行断言(Lookbehind)

import re

"""
(?<=pattern):正向后行断言,前面是pattern
(?<!pattern):负向后行断言,前面不是pattern
"""

# 正向后行断言:匹配前面是特定货币符号的数字
text = "$100, €200, $300"
pattern = r'(?<=\$)\d+' # 匹配前面是$的数字
print(re.findall(pattern, text)) # ['100', '300']

# 匹配引号中的内容(不包括引号)
text2 = 'He said "Hello" and "Goodbye"'
pattern = r'(?<=")\w+(?=")'
print(re.findall(pattern, text2)) # ['Hello', 'Goodbye']

# 负向后行断言:匹配前面不是特定字符的内容
text3 = "100USD, 200EUR, 300GBP"
pattern = r'(?<!\$)\d+' # 匹配前面不是$的数字
print(re.findall(pattern, text3)) # ['100', '200', '300']

# 匹配不在"https"之后的"://"
# 注意:后行断言必须是定长模式,(?<!http|ftp)这种各分支长度不同的写法会抛出re.error
text4 = "https://example.com, http://test.com, ftp://files.com"
pattern = r'(?<!https)://'
print(re.findall(pattern, text4)) # ['://', '://'] (http和ftp后面的://)

# 实际应用:提取最后一个键值对的值
text5 = "name=John, age=30, city=NYC"
pattern = r'(?<==)\w+(?=,?\s*$)'
print(re.findall(pattern, text5)) # ['NYC']
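
把后行断言和先行断言组合,可以实现纯插入式的替换,例如给数字加千位分隔符(示意例子):

print(re.sub(r'(?<=\d)(?=(?:\d{3})+$)', ',', "1234567"))
# 1,234,567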

Match 对象

Match 对象方法

import re

text = "Python 3.11 released on 2023-12-25"
pattern = r'(\w+)\s*(\d+\.\d+)'

match = re.search(pattern, text)

if match:
    # group():返回匹配的字符串
    print(match.group()) # Python 3.11 (完整匹配)
    print(match.group(0)) # Python 3.11 (同上)
    print(match.group(1)) # Python (第一组)
    print(match.group(2)) # 3.11 (第二组)

    # groups():返回所有分组(不包括group(0))
    print(match.groups()) # ('Python', '3.11')

    # groupdict():返回命名分组的字典
    pattern_named = r'(?P<lang>\w+)\s*(?P<ver>\d+\.\d+)'
    match_named = re.search(pattern_named, text)
    print(match_named.groupdict()) # {'lang': 'Python', 'ver': '3.11'}

    # start():返回匹配开始位置
    print(match.start()) # 0
    print(match.start(1)) # 0 (第一组开始)
    print(match.start(2)) # 7 (第二组开始)

    # end():返回匹配结束位置
    print(match.end()) # 11
    print(match.end(1)) # 6 (第一组结束)
    print(match.end(2)) # 11 (第二组结束)

    # span():返回(start, end)元组
    print(match.span()) # (0, 11)
    print(match.span(1)) # (0, 6)
    print(match.span(2)) # (7, 11)

    # string:返回被匹配的字符串
    print(match.string) # Python 3.11 released on 2023-12-25

    # re:返回匹配时使用的Pattern对象
    print(match.re) # re.compile('(\\w+)\\s*(\\d+\\.\\d+)')

实际应用示例

import re

# 验证和提取URL
url = "https://www.example.com/page?id=123&name=test"
pattern = r'(?P<protocol>https?)://(?P<domain>[\w.]+)(?P<path>/[\w/?&=]*)?'

match = re.search(pattern, url)
if match:
print(f"Protocol: {match.group('protocol')}")
print(f"Domain: {match.group('domain')}")
print(f"Path: {match.group('path') or '/'}")

# 解析命令行参数
command = "copy file1.txt file2.txt /force /verbose"
pattern = r'(\w+)(?:\s+(\S+))*'
match = re.match(pattern, command)

if match:
    cmd = match.group(1)
    args = re.findall(r'/(\w+)', command)
    files = re.findall(r'(\S+\.txt)', command)
    print(f"Command: {cmd}")
    print(f"Files: {files}")
    print(f"Options: {args}")

编译标志

常用标志

import re

"""
re.IGNORECASE / re.I:忽略大小写
re.MULTILINE / re.M:多行模式(^$匹配每行)
re.DOTALL / re.S:点号匹配换行符
re.VERBOSE / re.X:详细模式,允许注释和空白
re.ASCII / re.A:ASCII模式,只匹配ASCII字符
re.UNICODE / re.U:Unicode模式(默认)
re.LOCALE / re.L:本地化模式(已弃用)
"""

# re.IGNORECASE:忽略大小写
text = "Python python PYTHON PyThOn"
pattern = re.compile(r'python', re.IGNORECASE)
print(pattern.findall(text)) # ['Python', 'python', 'PYTHON', 'PyThOn']

# re.MULTILINE:多行模式
text = """Line 1: Python
Line 2: Java
Line 3: Python"""

print(re.findall(r'^Line \d+', text, re.MULTILINE))
# ['Line 1', 'Line 2', 'Line 3']

# re.DOTALL:点号匹配换行符
html = """<div>
Content
</div>"""

print(re.search(r'<div>.*</div>', html, re.DOTALL).group())
# <div>
# Content
# </div>

# re.VERBOSE:详细模式,允许注释和换行
pattern = re.compile(r'''
\b # 单词边界
\w+ # 用户名
@ # @符号
[\w.]+ # 域名
\. # 点号
[a-z]{2,} # 顶级域名
\b # 单词边界
''', re.VERBOSE | re.IGNORECASE)

email = "user@example.com"
print(pattern.match(email)) # <re.Match object; span=(0, 16), match='user@example.com'>

# 组合多个标志
text = """Python
python
PYTHON
Java"""

pattern = re.compile(r'^python', re.MULTILINE | re.IGNORECASE)
print(pattern.findall(text)) # ['Python', 'python', 'PYTHON']
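
除了作为参数传入,标志也可以用内联形式 (?i)、(?m)、(?s) 等直接写在模式开头,效果等价(示意):

print(re.findall(r'(?i)python', "Python PYTHON python"))
# ['Python', 'PYTHON', 'python']
print(re.findall(r'(?im)^python', "Python\npython\nJava"))
# ['Python', 'python']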

常用模式

邮箱验证

import re

def validate_email(email):
"""验证邮箱地址"""
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
return bool(re.match(pattern, email))

# 测试
emails = [
"user@example.com",
"user.name@example.com",
"user+tag@example.co.uk",
"invalid@",
"@example.com",
"user@.com"
]

for email in emails:
print(f"{email:30} -> {validate_email(email)}")

# 提取邮箱
text = "Contact us at info@example.com or support@test.org for help"
emails = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
print(emails) # ['info@example.com', 'support@test.org']

手机号验证

import re

def validate_phone_cn(phone):
"""验证中国手机号"""
pattern = r'^1[3-9]\d{9}$'
return bool(re.match(pattern, phone))

# 测试
phones = [
"13812345678", # 有效
"15912345678", # 有效
"12345678901", # 无效(第二位不是3-9)
"1381234567", # 无效(位数不够)
"138123456789", # 无效(位数太多)
]

for phone in phones:
print(f"{phone:15} -> {validate_phone_cn(phone)}")

# 脱敏手机号
text = "客服电话:13812345678,投诉电话:15912345678"
pattern = r'(\d{3})\d{4}(\d{4})'
masked = re.sub(pattern, r'\1****\2', text)
print(masked) # 客服电话:138****5678,投诉电话:159****5678

URL 匹配

import re

def extract_urls(text):
"""提取URL"""
pattern = r'https?://[\w\-]+(\.[\w\-]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
return re.findall(pattern, text)

text = """
访问我们的网站 https://www.example.com,
文档地址 https://docs.example.com/guide/tutorial.html,
或FTP站点 ftp://files.example.com/downloads
"""

urls = re.findall(r'https?://[\w\-]+(?:\.[\w\-]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?', text)
print(urls)

# 更精确的URL匹配
pattern = r'(?P<url>https?://(?:[\w-]+\.)+[\w-]+(?:/[\w\-./?%&=]*)?)'
matches = re.finditer(pattern, text)

for match in matches:
    print(match.group('url'))

IP 地址匹配

import re

def validate_ip(ip):
"""验证IPv4地址"""
pattern = r'^((25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}(25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)$'
return bool(re.match(pattern, ip))

# 测试
ips = [
"192.168.1.1", # 有效
"255.255.255.255", # 有效
"256.1.1.1", # 无效(超过255)
"192.168.1", # 无效(缺少一段)
"192.168.1.1.1", # 无效(多了一段)
]

for ip in ips:
print(f"{ip:20} -> {validate_ip(ip)}")

# 提取IP地址
log = "User connected from 192.168.1.100 at 10:30, admin from 10.0.0.1 at 11:45"
ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', log)
print(ips) # ['192.168.1.100', '10.0.0.1']

日期时间匹配

import re

# 匹配日期(YYYY-MM-DD)
text = "今天是 2023-12-25, 明天是 2023-12-26"
dates = re.findall(r'\d{4}-\d{2}-\d{2}', text)
print(dates) # ['2023-12-25', '2023-12-26']

# 验证日期格式
def validate_date(date_str):
"""验证日期格式 YYYY-MM-DD"""
pattern = r'^(?P<year>\d{4})-(?P<month>0[1-9]|1[0-2])-(?P<day>0[1-9]|[12]\d|3[01])$'
match = re.match(pattern, date_str)
if match:
return match.groupdict()
return None

print(validate_date("2023-12-25"))
# {'year': '2023', 'month': '12', 'day': '25'}

print(validate_date("2023-13-45")) # None

# 匹配时间 HH:MM:SS
text = "开始时间: 09:30:00, 结束时间: 17:30:00"
times = re.findall(r'\d{2}:\d{2}:\d{2}', text)
print(times) # ['09:30:00', '17:30:00']

# 匹配日期时间
datetime_text = "2023-12-25 14:30:45"
pattern = r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2})'
match = re.match(pattern, datetime_text)

if match:
    date, time = match.groups()
    print(f"Date: {date}, Time: {time}")

HTML/XML 标签

import re

# 提取HTML标签内容
html = "<div>Hello</div><p>World</p><span>Test</span>"

# 提取标签内容(非贪婪)
pattern = r'<(div|p|span)>(.*?)</\1>'
matches = re.findall(pattern, html)

for tag, content in matches:
print(f"{tag}: {content}")

# 提取所有标签
tags = re.findall(r'<(\w+)', html)
print(tags) # ['div', 'p', 'span']

# 提取属性
html2 = '<a href="https://example.com" class="link" target="_blank">Link</a>'

# 提取href属性
href = re.search(r'href="([^"]*)"', html2)
if href:
print(f"href: {href.group(1)}") # href: https://example.com

# 提取所有属性
attrs = re.findall(r'(\w+)="([^"]*)"', html2)
print(attrs)
# [('href', 'https://example.com'), ('class', 'link'), ('target', '_blank')]

清理和规范化文本

import re

# 移除多余空白
text = "Hello World This is a Test"
cleaned = re.sub(r'\s+', ' ', text)
print(cleaned) # Hello World This is a Test

# 移除特殊字符
text2 = "Hello, World! @#$%^&*()"
cleaned = re.sub(r'[^\w\s]', '', text2)
print(cleaned) # Hello World

# 标准化空格和标点
text3 = "Hello , world ! This is a test ."
pattern = r'\s+([,.!?])'
cleaned = re.sub(pattern, r'\1', text3)
print(cleaned) # Hello, world! This is a test.

# 提取数字
text4 = "Price: $99.99, Discount: 20%, Quantity: 5"
numbers = re.findall(r'\d+\.?\d*', text4)
print(numbers) # ['99.99', '20', '5']

# 移除HTML标签
html = "<div><p>Hello <b>World</b></p></div>"
text_only = re.sub(r'<[^>]+>', '', html)
print(text_only) # Hello World

高级技巧

条件匹配

import re

# 使用(?(id/name)yes-pattern|no-pattern):根据指定分组是否匹配成功,选择不同的后续模式
# 匹配两种格式的日期:YYYY-MM-DD 或 MM/DD/YYYY
text = "2023-12-25 and 12/25/2023"

# 如果第1组(年份加连字符)匹配成功,按 MM-DD 继续;否则按 MM/DD/YYYY 匹配
pattern = r'(\d{4}-)?(?(1)\d{2}-\d{2}|\d{2}/\d{2}/\d{4})'
print([m.group() for m in re.finditer(pattern, text)])
# ['2023-12-25', '12/25/2023']

# 匹配带引号或不带引号的字符串
text2 = '"hello", "world", test'

# 如果有引号,匹配引号内的内容;否则匹配单词
pattern = r'(?:"([^"]*)"|(\w+))'
matches = re.findall(pattern, text2)

for quoted, unquoted in matches:
    if quoted:
        print(f"Quoted: {quoted}")
    elif unquoted:
        print(f"Unquoted: {unquoted}")

回溯控制

import re

# (?>pattern):原子分组,匹配后不再回溯(Python 3.11+ 才支持该语法)
text = "aaab"

# 普通分组:a+ 先吃掉"aaa",失败后回溯吐出一个a,最终匹配成功
pattern1 = r'(a+)ab'
print(re.match(pattern1, text)) # <re.Match object; span=(0, 4), match='aaab'>

# 原子分组:a+ 吃掉"aaa"后不再回溯,匹配失败
pattern2 = r'(?>a+)ab'
print(re.match(pattern2, text)) # None

# 实际意义:防止嵌套量词导致的灾难性回溯
# 例如 (a+)+b 对只含a的长字符串会产生指数级回溯,写成 (?>a+)+b 则会快速失败
html = "<div>content</div>"
pattern = r'<(\w+)>(?>[^<]*)</\1>' # 用原子分组匹配标签内容,减少回溯
print(re.search(pattern, html).group()) # <div>content</div>

递归匹配

import re
import regex # 需要安装regex模块: pip install regex

# Python的re模块不支持递归,但regex模块支持
# 匹配括号嵌套
text = "(a(b(c)d)e)"

# 使用regex模块的递归模式
pattern = r'\((?:[^()]|(?R))*\)'
matches = regex.findall(pattern, text)
print(matches) # ['(a(b(c)d)e)']

性能优化

编译正则表达式

import re

# 好的做法:编译后重复使用
pattern = re.compile(r'\b\w{4}\b')

# 在循环中使用
texts = ["word", "test", "code", "python"]
for text in texts:
    matches = pattern.findall(text)
    print(f"{text}: {matches}")

# 不好的做法:每次都重新编译
for text in texts:
    matches = re.findall(r'\b\w{4}\b', text) # 效率低
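
可以用 timeit 粗略对比两种写法的耗时(示意代码;re 模块内部对最近使用过的模式有缓存,所以差距通常不算大,具体数值取决于运行环境):

import re
import timeit

sample = "this is a sample text for testing " * 10
compiled = re.compile(r'\b\w{4}\b')

t1 = timeit.timeit(lambda: re.findall(r'\b\w{4}\b', sample), number=10000)
t2 = timeit.timeit(lambda: compiled.findall(sample), number=10000)
print(f"每次调用 re.findall: {t1:.3f}s, 预编译: {t2:.3f}s")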

避免贪婪回溯

import re

text = "a" * 100 + "b"

# 避免过度贪婪
# 不好:a+a+ 两个量词互相重叠,匹配失败时会产生大量回溯
pattern1 = re.compile(r'^a+a+$')

# 更好:模式各部分界限清晰,失败时能快速返回
pattern2 = re.compile(r'^a+b$')

# 对于HTML等结构化数据,使用专用解析器
import html.parser
# 而不是正则表达式

使用非捕获组

import re

text = "Python, Java, C++, JavaScript"

# 当不需要捕获分组时,使用非捕获组
pattern = r'(?:\w+),\s*(?:\w+)'
match = re.search(pattern, text)

# 对比
pattern_with_capture = r'(\w+),\s*(\w+)'
match_with_capture = re.search(pattern_with_capture, text)

# 非捕获组性能更好,因为不需要保存分组信息

实战案例

日志分析

import re

# 解析Apache访问日志
log_line = '127.0.0.1 - - [25/Dec/2023:10:30:45 +0800] "GET /index.html HTTP/1.1" 200 2326'

pattern = r'''
(?P<ip>\d+\.\d+\.\d+\.\d+) # IP地址
\s+-\s+-\s+ # 占位符
\[(?P<timestamp>[^\]]+)\] # 时间戳
\s+"(?P<method>\w+) # 请求方法
\s+(?P<path>[^\s]+) # 请求路径
\s+(?P<protocol>[\w/.]+)" # 协议
\s+(?P<status>\d{3}) # 状态码
\s+(?P<size>\d+) # 响应大小
'''

match = re.match(pattern, log_line, re.VERBOSE)

if match:
    data = match.groupdict()
    print(f"IP: {data['ip']}")
    print(f"Path: {data['path']}")
    print(f"Status: {data['status']}")

# 分析日志文件
def analyze_logs(log_file):
    errors = []
    # 状态码位于请求部分的闭合引号之后,引号与5xx之间有空白
    pattern = re.compile(r'\[([^\]]+)\].*"\s+(5\d{2})\s+\d+')

    with open(log_file, 'r') as f:
        for line in f:
            match = pattern.search(line)
            if match:
                errors.append({
                    'timestamp': match.group(1),
                    'status': match.group(2)
                })

    return errors

数据清洗

import re

def clean_phone(phone):
"""清洗和标准化电话号码"""
# 移除所有非数字字符
digits = re.sub(r'\D', '', phone)

# 验证长度
if len(digits) == 11 and digits.startswith('1'):
# 格式化为: 1-XXX-XXXX-XXXX
return f"{digits[0]}-{digits[1:4]}-{digits[4:8]}-{digits[8:]}"

return None

phones = [
"138-1234-5678",
"138 1234 5678",
"(138) 1234-5678",
"+86 138 1234 5678"
]

for phone in phones:
    cleaned = clean_phone(phone)
    print(f"{phone:25} -> {cleaned}")

# 清理用户输入
def clean_user_input(text):
    # 移除多余空白
    text = re.sub(r'\s+', ' ', text)
    # 移除特殊字符
    text = re.sub(r'[^\w\s\-@.]', '', text)
    # 去除首尾空白
    text = text.strip()
    return text

user_input = " Hello!!! World@#$ "
print(clean_user_input(user_input)) # Hello World@

文本替换

import re

# 敏感词过滤
text = "这是一个测试文本,包含一些敏感词汇"

# 使用字典替换
replacements = {
    '敏感': '***',
    '词汇': '**',
}

pattern = re.compile('|'.join(map(re.escape, replacements.keys())))
filtered = pattern.sub(lambda m: replacements[m.group()], text)
print(filtered) # 这是一个测试文本,包含一些*****

# 模板替换
template = "Hello {name}, welcome to {place}! Your balance is ${balance}."

data = {
    'name': 'Alice',
    'place': 'Python',
    'balance': '100.50'
}

result = template.format(**data)
print(result) # Hello Alice, welcome to Python! Your balance is $100.50.

# 使用正则替换
pattern = r'\{(\w+)\}'
result2 = re.sub(pattern, lambda m: data.get(m.group(1), m.group()), template)
print(result2)

数据提取

import re

# 从文本中提取结构化数据
text = """
员工信息:
姓名:张三,年龄:25,部门:技术部
姓名:李四,年龄:30,部门:市场部
姓名:王五,年龄:28,部门:技术部
"""

pattern = r'姓名:(?P<name>\w+),年龄:(?P<age>\d+),部门:(?P<dept>\w+)'
employees = re.findall(pattern, text)

print(employees)
# [('张三', '25', '技术部'), ('李四', '30', '市场部'), ('王五', '28', '技术部')]

# 转换为字典列表
employees_list = [
    {'name': name, 'age': int(age), 'dept': dept}
    for name, age, dept in employees
]

print(employees_list)

# 提取表格数据
table_text = """
| ID | Name | Age |
|----|---------|-----|
| 1 | Alice | 25 |
| 2 | Bob | 30 |
| 3 | Charlie | 35 |
"""

rows = re.findall(r'\|\s*(\d+)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*\|', table_text)

for row_id, name, age in rows:
print(f"ID: {row_id}, Name: {name}, Age: {age}")

最佳实践

使用建议

import re

# 1. 使用原始字符串
pattern = r'\d+' # 推荐
pattern = '\\d+' # 不推荐

# 2. 编译常用的正则表达式
COMMON_PATTERNS = {
    'email': re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'),
    'phone': re.compile(r'1[3-9]\d{9}'),
    'ip': re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'),
}

# 使用编译好的模式
def validate_data(text, type_):
    pattern = COMMON_PATTERNS.get(type_)
    if pattern:
        return bool(pattern.fullmatch(text)) # 验证场景用fullmatch,避免只匹配到前缀就通过
    return False

# 3. 使用具体量词而非贪婪匹配
# 不好:.*
# 好:.*? 或具体模式

# 4. 使用非捕获组提升性能
# 捕获组: (pattern)
# 非捕获组: (?:pattern)

# 5. 注释复杂的正则表达式
complex_pattern = re.compile(r'''
^ # 字符串开始
[a-zA-Z0-9._%+-]+ # 用户名
@ # @符号
[a-zA-Z0-9.-]+ # 域名
\. # 点号
[a-zA-Z]{2,} # 顶级域名
$ # 字符串结束
''', re.VERBOSE)

# 6. 错误处理
try:
    pattern = re.compile(r'(?:invalid') # 不完整的正则
except re.error as e:
    print(f"正则表达式错误: {e}")

# 7. 测试正则表达式
def test_pattern(pattern, test_cases):
    for test_input, expected in test_cases:
        result = bool(pattern.match(test_input))
        status = "✓" if result == expected else "✗"
        print(f"{status} Input: {test_input:20} Expected: {expected}, Got: {result}")

# 测试邮箱模式
email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
test_cases = [
("user@example.com", True),
("invalid@", False),
("@example.com", False),
]

test_pattern(email_pattern, test_cases)

调试技巧

import re

# 使用re.DEBUG查看正则表达式解析过程
pattern = r'\b\w+\b'
re.compile(pattern, re.DEBUG)

# 分步构建复杂的正则表达式
# 第一步:匹配简单的单词
simple = re.compile(r'\w+')

# 第二步:添加边界
with_boundary = re.compile(r'\b\w+\b')

# 第三步:添加长度限制
final = re.compile(r'\b\w{4,}\b')

# 使用在线工具测试
# regex101.com, regexr.com, pythex.org

# 打印匹配结果用于调试
text = "Hello World"
match = re.search(r'\b\w+\b', text)

if match:
print(f"Match: {match.group()}")
print(f"Start: {match.start()}, End: {match.end()}")
print(f"Span: {match.span()}")

总结

正则表达式是Python中强大的文本处理工具:

核心概念

- **re模块**:提供正则表达式支持
- **原始字符串**:使用r''避免转义问题
- **元字符**:特殊的字符(. ^ $ * + ? { } [ ] \ | ( ))
- **字符类**:\d \s \w 及其大写形式
- **量词**:* + ? {n} {n,m} 控制重复次数
- **锚点**:^ $ \b \B 匹配位置
- **分组**:()捕获, (?:)非捕获, (?P<name>)命名分组
- **断言**:先行和后行断言实现复杂匹配

最佳实践

  1. 使用原始字符串定义正则表达式
  2. 编译常用模式提升性能
  3. 使用非贪婪匹配避免过度匹配
  4. 使用非捕获组优化性能
  5. 添加注释提高可读性
  6. 编写测试用例验证正确性
  7. 对于复杂数据结构考虑专用解析器

常用场景

  • 表单验证(邮箱、手机号、URL等)
  • 日志分析和数据提取
  • 文本清理和规范化
  • 数据爬取和信息提取
  • 搜索和替换操作

掌握正则表达式需要大量练习,建议从简单模式开始,逐步构建复杂匹配规则。