正则表达式

正则表达式(Regular Expression)是一种强大的文本处理工具,用于匹配、查找、替换和分割字符串。Python 通过 re 模块提供正则表达式支持。

re 模块基础

导入模块

import re

# 原始字符串:避免转义字符的干扰
pattern = r'\d+'  # 推荐:使用原始字符串
pattern2 = '\\d+'  # 不推荐:需要双重转义

核心方法对比

import re

text = "Python 3.9, Python 3.10, Python 3.11"

# 1. re.match():只从字符串开头匹配
result = re.match(r'Python', text)
print(result.group())  # Python
print(result.span())    # (0, 6)

result = re.match(r'3\.9', text)
print(result)  # None:不匹配开头

# 2. re.search():扫描整个字符串,找到第一个匹配
result = re.search(r'3\.\d+', text)
print(result.group())  # 3.9

# 3. re.findall():返回所有非重叠匹配
results = re.findall(r'3\.\d+', text)
print(results)  # ['3.9', '3.10', '3.11']

# 4. re.finditer():返回所有匹配的迭代器
for match in re.finditer(r'3\.\d+', text):
    print(f"Found: {match.group()} at {match.span()}")

# 5. re.sub():替换匹配项
new_text = re.sub(r'3\.\d+', '4.0', text)
print(new_text)  # Python 4.0, Python 4.0, Python 4.0

# 6. re.split():根据匹配分割字符串
parts = re.split(r',\s*', text)
print(parts)  # ['Python 3.9', 'Python 3.10', 'Python 3.11']

编译正则表达式

import re

# 编译:提高重复使用效率
pattern = re.compile(r'\b\w{4}\b')  # 匹配四个字母的单词
text = "This is a test code"

# 使用编译后的模式
matches = pattern.findall(text)
print(matches)  # ['This', 'test', 'code']

# 编译时指定标志
pattern = re.compile(r'python', re.IGNORECASE)
print(pattern.findall("I love Python and python"))  # ['Python', 'python']

元字符

特殊字符列表

import re

"""
元字符列表:
.       匹配任意字符(除换行符)
^       匹配字符串开头
$       匹配字符串结尾
*       重复0次或多次
+       重复1次或多次
?       重复0次或1次
{ }     自定义重复次数
[ ]     字符集
\       转义字符
|       或运算
( )     分组
"""

# 转义元字符:匹配特殊字符本身
text = "Price: $100, Discount: 20%"
pattern = r'\$\d+'  # 匹配美元符号
print(re.findall(pattern, text))  # ['$100']

# 点号.:匹配任意字符
text = "cat, cot, cut, c_t, c\nt"
print(re.findall(r'c.t', text))  # ['cat', 'cot', 'cut', 'c_t']
# 点号不匹配换行符
print(re.findall(r'c.t', text, re.DOTALL))  # ['cat', 'cot', 'cut', 'c_t', 'c\nt']

字符匹配

预定义字符类

import re

text = "Python 3.11 released in 2023, price $99.99!"

"""
\d:匹配数字 [0-9]
\D:匹配非数字 [^0-9]
\s:匹配空白 [ \t\n\r\f\v]
\S:匹配非空白
\w:匹配单词字符 [a-zA-Z0-9_]
\W:匹配非单词字符
"""

# 匹配所有数字
numbers = re.findall(r'\d+', text)
print(numbers)  # ['3', '11', '2023', '99', '99']

# 匹配单词
words = re.findall(r'\w+', text)
print(words)  # ['Python', '3', '11', 'released', 'in', '2023', 'price', '99', '99']

# 匹配空白分隔的单词
words = re.findall(r'\S+', text)
print(words)  # ['Python', '3.11', 'released', 'in', '2023,', 'price', '$99.99!']

字符集

import re

"""
[abc]:匹配a、b或c
[^abc]:匹配除a、b、c之外的字符
[a-z]:匹配a到z的小写字母
[A-Z]:匹配A到Z的大写字母
[0-9]:匹配数字
[a-zA-Z]:匹配所有字母
[a-zA-Z0-9]:等价于\w
"""

# 匹配元音字母
text = "Hello World"
vowels = re.findall(r'[aeiouAEIOU]', text)
print(vowels)  # ['e', 'o', 'o']

# 匹配非元音字母
consonants = re.findall(r'[^aeiouAEIOU\W]', text)
print(consonants)  # ['H', 'l', 'l', 'W', 'r', 'l', 'd']

# 匹配大写字母
uppercase = re.findall(r'[A-Z]', text)
print(uppercase)  # ['H', 'W']

# 匹配十六进制数
hex_text = "0x1A 0xFF 0x3C"
hex_numbers = re.findall(r'0x[0-9A-Fa-f]+', hex_text)
print(hex_numbers)  # ['0x1A', '0xFF', '0x3C']

重复匹配

量词

import re

"""
*:重复0次或多次 {0,}
+:重复1次或多次 {1,}
?:重复0次或1次 {0,1}
{n}:重复n次
{n,}:重复至少n次
{n,m}:重复n到m次
"""

text = "a, aa, aaa, aaaa, b"

# *:0次或多次
print(re.findall(r'a*', text))  # 匹配所有a和空字符串

# +:1次或多次
print(re.findall(r'a+', text))  # ['a', 'aa', 'aaa', 'aaaa']

# ?:0次或1次
text2 = "color, colour"
print(re.findall(r'colou?r', text2))  # ['color', 'colour']

# {n}:精确n次
text3 = "123 4567 89"
print(re.findall(r'\d{3}', text3))  # ['123', '456']

# {n,}:至少n次
print(re.findall(r'\d{3,}', text3))  # ['123', '4567']

# {n,m}:n到m次
print(re.findall(r'\d{3,4}', text3))  # ['123', '4567']

实际应用

import re

# 匹配IP地址
ip = "192.168.1.1 10.0.0.1 255.255.255.0"
pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
print(re.findall(pattern, ip))  # ['192.168.1.1', '10.0.0.1', '255.255.255.0']

# 匹配日期(YYYY-MM-DD)
date_text = "2023-01-15, 1999-12-31, 2023-1-1"
pattern = r'\d{4}-\d{2}-\d{2}'
print(re.findall(pattern, date_text))  # ['2023-01-15', '1999-12-31']

# 匹配时间(HH:MM:SS)
time_text = "23:59:59, 00:00:00, 9:30:00"
pattern = r'\d{2}:\d{2}:\d{2}'
print(re.findall(pattern, time_text))  # ['23:59:59', '00:00:00']

位置匹配

锚点

import re

"""
^:匹配字符串开头
$:匹配字符串结尾
\b:匹配单词边界
\B:匹配非单词边界
\A:匹配字符串开头(不受多行影响)
\Z:匹配字符串结尾(不受多行影响)
"""

text = "Python is awesome. I love Python"

# ^:开头
print(re.findall(r'^Python', text))  # ['Python']
print(re.findall(r'^Python', text, re.MULTILINE))  # ['Python']

# $:结尾
print(re.findall(r'Python$', text))  # ['Python']

# \b:单词边界
print(re.findall(r'\bPython\b', text))  # ['Python', 'Python']
print(re.findall(r'\bPyt', text))  # ['Pyt', 'Pyt']

# \B:非单词边界
text2 = "PythonPython"
print(re.findall(r'Python\B', text2))  # ['Python']

# 多行模式
multiline = """Line 1: Python
Line 2: Java
Line 3: Python"""

# 在多行模式下,^和$匹配每行的开头和结尾
print(re.findall(r'^Line \d+', multiline, re.MULTILINE))
# ['Line 1', 'Line 2', 'Line 3']

分组与捕获

捕获分组

import re

# ( ):捕获分组
text = "John: 30 years, Jane: 25 years, Bob: 35 years"

# 提取名字和年龄
pattern = r'(\w+):\s*(\d+)\s*years'
matches = re.findall(pattern, text)
print(matches)
# [('John', '30'), ('Jane', '25'), ('Bob', '35')]

# 使用finditer获取详细信息
for match in re.finditer(pattern, text):
    name = match.group(1)
    age = match.group(2)
    print(f"{name} is {age} years old")

# 嵌套分组
text = "2023-12-25"
pattern = r'((\d{4})-(\d{2})-(\d{2}))'
match = re.search(pattern, text)

print(match.group(0))  # 2023-12-25 (完整匹配)
print(match.group(1))  # 2023-12-25 (第一组)
print(match.group(2))  # 2023 (第二组)
print(match.group(3))  # 12 (第三组)
print(match.group(4))  # 25 (第四组)

# groups():返回所有分组
print(match.groups())  # ('2023-12-25', '2023', '12', '25')

命名分组

import re

# (?P<name>pattern):命名捕获组
text = "user@example.com, admin@test.org"

pattern = r'(?P<username>\w+)@(?P<domain>[\w.]+)'
for match in re.finditer(pattern, text):
    print(f"Username: {match.group('username')}")
    print(f"Domain: {match.group('domain')}")

# groupdict():返回命名分组的字典
match = re.search(pattern, text)
print(match.groupdict())
# {'username': 'user', 'domain': 'example.com'}

# 实际应用:解析日志
log = "2023-12-25 10:30:45 [ERROR] File not found: config.ini"
pattern = r'(?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<message>.*)'
match = re.search(pattern, log)

if match:
    print(f"Level: {match.group('level')}")
    print(f"Message: {match.group('message')}")

非捕获分组

import re

# (?:pattern):非捕获分组,只用于分组但不捕获
text = "Python, Java, C++, JavaScript"

# 匹配"语言, "或"语言 and "
pattern = r'\w+(?:, | and )'
print(re.findall(pattern, text))
# ['Python, ', 'Java, ', 'C++ and ']

# 对比捕获分组和非捕获分组
pattern1 = r'(?:\w+), (\w+)'  # 第一组不捕获
pattern2 = r'(\w+), (\w+)'    # 两组都捕获

match1 = re.search(pattern1, text)
match2 = re.search(pattern2, text)

print(match1.groups())  # ('Java',)  # 只捕获第二组
print(match2.groups())  # ('Python', 'Java')  # 捕获两组

反向引用

import re

# \n:引用第n个捕获组
text = "hello hello, world world, test test"

# 匹配重复的单词
pattern = r'(\w+) \1'
print(re.findall(pattern, text))  # ['hello', 'world', 'test']

# 匹配HTML标签(简化版)
html = "<div>Content</div><p>Paragraph</p>"
pattern = r'<(\w+)>.*?</\1>'
print(re.findall(pattern, html))  # ['div', 'p']

# 命名分组的反向引用
text = "start-start, end-end, middle-middle"
pattern = r'(?P<word>\w+)-(?P=word)'
print(re.findall(pattern, text))  # ['start', 'end', 'middle']

贪婪与非贪婪

贪婪匹配(默认)

import re

text = "<div>Content 1</div><div>Content 2</div>"

# 贪婪:*、+、{n,m}默认尽可能多匹配
pattern_greedy = r'<div>.*</div>'
match = re.search(pattern_greedy, text)
print(match.group())
# <div>Content 1</div><div>Content 2</div>

非贪婪匹配

import re

text = "<div>Content 1</div><div>Content 2</div>"

# 非贪婪:*?、+?、??、{n,m}?尽可能少匹配
pattern_lazy = r'<div>.*?</div>'
matches = re.findall(pattern_lazy, text)
print(matches)
# ['<div>Content 1</div>', '<div>Content 2</div>']

# 对比例子
text2 = "a<b>c</b>d<b>e</b>f"

# 贪婪:匹配到最后一个</b>
print(re.search(r'<b>.*</b>', text2).group())
# <b>c</b>d<b>e</b>

# 非贪婪:匹配到第一个</b>
print(re.search(r'<b>.*?</b>', text2).group())
# <b>c</b>

# 实际应用:提取引号内容
text3 = 'He said "Hello" and she replied "Hi there"'
quotes = re.findall(r'"(.*?)"', text3)
print(quotes)  # ['Hello', 'Hi there']

量词对比

import re

text = "a, ab, abb, abbb"

# *:尽可能多
print(re.findall(r'ab*', text))  # ['a', 'ab', 'abb', 'abbb']

# *?:尽可能少
print(re.findall(r'ab*?', text))  # ['a', 'a', 'a', 'a']

# +:尽可能多
print(re.findall(r'ab+', text))  # ['ab', 'abb', 'abbb']

# +?:尽可能少
print(re.findall(r'ab+?', text))  # ['ab', 'ab', 'ab']

# HTML示例
html = '<b>Bold</b> and <i>Italic</i>'

# 贪婪
print(re.findall(r'<.*>', html))
# ['<b>Bold</b> and <i>Italic</i>']

# 非贪婪
print(re.findall(r'<.*?>', html))
# ['<b>', '</b>', '<i>', '</i>']

零宽断言

先行断言(Lookahead)

import re

"""
(?=pattern):正向先行断言,后面是pattern
(?!pattern):负向先行断言,后面不是pattern
"""

# 正向先行断言:匹配后面是特定数字的货币
text = "Price: $100, $200, $300"
pattern = r'\$(?=\d{3})'  # 匹配后面跟3位数字的$
print(re.findall(pattern, text))  # ['$']

# 匹配三位数,且后面是"USD"
text2 = "100 USD, 200 EUR, 300 USD, 400 GBP"
pattern = r'\d{3}(?= USD)'
print(re.findall(pattern, text2))  # ['100', '300']

# 负向先行断言:匹配后面不是特定字符的内容
text3 = "test123, test456, testabc"
pattern = r'test(?!\d)'  # 匹配后面不是数字的test
print(re.findall(pattern, text3))  # ['test'] (testabc中的test)

# 匹配不以.exe结尾的文件
text4 = "file.txt, file.exe, file.pdf, file.doc, file.exe.bak"
pattern = r'\w+\.(?!exe\b)\w+'
print(re.findall(pattern, text4))
# ['file.txt', 'file.pdf', 'file.doc', 'file.exe.bak']

后行断言(Lookbehind)

import re

"""
(?<=pattern):正向后行断言,前面是pattern
(?<!pattern):负向后行断言,前面不是pattern
"""

# 正向后行断言:匹配前面是特定货币符号的数字
text = "$100, €200, $300"
pattern = r'(?<=\$)\d+'  # 匹配前面是$的数字
print(re.findall(pattern, text))  # ['100', '300']

# 匹配引号中的内容(不包括引号)
text2 = 'He said "Hello" and "Goodbye"'
pattern = r'(?<=")\w+(?=")'
print(re.findall(pattern, text2))  # ['Hello', 'Goodbye']

# 负向后行断言:匹配前面不是特定字符的内容
text3 = "100USD, 200EUR, 300GBP"
pattern = r'(?<!\$)\d+'  # 匹配前面不是$的数字
print(re.findall(pattern, text3))  # ['100', '200', '300']

# 匹配不在协议后的://
text4 = "https://example.com, http://test.com, ftp://files.com, string://test"
pattern = r'(?<!http|ftp)s://'
print(re.findall(pattern, text4))  # ['s://'] (只匹配string://中的s://)

# 实际应用:提取变量值
text5 = "name=John, age=30, city=NYC"
pattern = r'(?<==)\w+(?=,?\s*$)'
print(re.findall(pattern, text5))  # ['NYC']

Match 对象

Match 对象方法

import re

text = "Python 3.11 released on 2023-12-25"
pattern = r'(\w+)\s*(\d+\.\d+)'

match = re.search(pattern, text)

if match:
    # group():返回匹配的字符串
    print(match.group())      # Python 3.11 (完整匹配)
    print(match.group(0))     # Python 3.11 (同上)
    print(match.group(1))     # Python (第一组)
    print(match.group(2))     # 3.11 (第二组)

    # groups():返回所有分组(不包括group(0))
    print(match.groups())     # ('Python', '3.11')

    # groupdict():返回命名分组的字典
    pattern_named = r'(?P<lang>\w+)\s*(?P<ver>\d+\.\d+)'
    match_named = re.search(pattern_named, text)
    print(match_named.groupdict())  # {'lang': 'Python', 'ver': '3.11'}

    # start():返回匹配开始位置
    print(match.start())      # 0
    print(match.start(1))     # 0 (第一组开始)
    print(match.start(2))     # 7 (第二组开始)

    # end():返回匹配结束位置
    print(match.end())        # 12
    print(match.end(1))       # 6 (第一组结束)
    print(match.end(2))       # 12 (第二组结束)

    # span():返回(start, end)元组
    print(match.span())       # (0, 12)
    print(match.span(1))      # (0, 6)
    print(match.span(2))      # (7, 12)

    # string:返回被匹配的字符串
    print(match.string)       # Python 3.11 released on 2023-12-25

    # re:返回匹配时使用的Pattern对象
    print(match.re)           # re.compile('(\\w+)\\s*(\\d+\\.\\d+)')

实际应用示例

import re

# 验证和提取URL
url = "https://www.example.com/page?id=123&name=test"
pattern = r'(?P<protocol>https?)://(?P<domain>[\w.]+)(?P<path>/[\w/?&=]*)?'

match = re.search(pattern, url)
if match:
    print(f"Protocol: {match.group('protocol')}")
    print(f"Domain: {match.group('domain')}")
    print(f"Path: {match.group('path') or '/'}")

# 解析命令行参数
command = "copy file1.txt file2.txt /force /verbose"
pattern = r'(\w+)(?:\s+(\S+))*'
match = re.match(pattern, command)

if match:
    cmd = match.group(1)
    args = re.findall(r'/(\w+)', command)
    files = re.findall(r'(\S+\.txt)', command)
    print(f"Command: {cmd}")
    print(f"Files: {files}")
    print(f"Options: {args}")

编译标志

常用标志

import re

"""
re.IGNORECASE / re.I:忽略大小写
re.MULTILINE / re.M:多行模式(^$匹配每行)
re.DOTALL / re.S:点号匹配换行符
re.VERBOSE / re.X:详细模式,允许注释和空白
re.ASCII / re.A:ASCII模式,只匹配ASCII字符
re.UNICODE / re.U:Unicode模式(默认)
re.LOCALE / re.L:本地化模式(已弃用)
"""

# re.IGNORECASE:忽略大小写
text = "Python python PYTHON PyThOn"
pattern = re.compile(r'python', re.IGNORECASE)
print(pattern.findall(text))  # ['Python', 'python', 'PYTHON', 'PyThOn']

# re.MULTILINE:多行模式
text = """Line 1: Python
Line 2: Java
Line 3: Python"""

print(re.findall(r'^Line \d+', text, re.MULTILINE))
# ['Line 1', 'Line 2', 'Line 3']

# re.DOTALL:点号匹配换行符
html = """<div>
Content
</div>"""

print(re.search(r'<div>.*</div>', html, re.DOTALL).group())
# <div>
# Content
# </div>

# re.VERBOSE:详细模式,允许注释和换行
pattern = re.compile(r'''
    \b          # 单词边界
    \w+         # 用户名
    @           # @符号
    [\w.]+      # 域名
    \.          # 点号
    [a-z]{2,}   # 顶级域名
    \b          # 单词边界
''', re.VERBOSE | re.IGNORECASE)

email = "user@example.com"
print(pattern.match(email))  # <re.Match object; span=(0, 16), match='user@example.com'>

# 组合多个标志
text = """Python
python
PYTHON
Java"""

pattern = re.compile(r'^python', re.MULTILINE | re.IGNORECASE)
print(pattern.findall(text))  # ['Python', 'python', 'PYTHON']

常用模式

邮箱验证

import re

def validate_email(email):
    """验证邮箱地址"""
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return bool(re.match(pattern, email))

# 测试
emails = [
    "user@example.com",
    "user.name@example.com",
    "user+tag@example.co.uk",
    "invalid@",
    "@example.com",
    "user@.com"
]

for email in emails:
    print(f"{email:30} -> {validate_email(email)}")

# 提取邮箱
text = "Contact us at info@example.com or support@test.org for help"
emails = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
print(emails)  # ['info@example.com', 'support@test.org']

手机号验证

import re

def validate_phone_cn(phone):
    """验证中国手机号"""
    pattern = r'^1[3-9]\d{9}$'
    return bool(re.match(pattern, phone))

# 测试
phones = [
    "13812345678",  # 有效
    "15912345678",  # 有效
    "12345678901",  # 无效(第二位不是3-9)
    "1381234567",   # 无效(位数不够)
    "138123456789", # 无效(位数太多)
]

for phone in phones:
    print(f"{phone:15} -> {validate_phone_cn(phone)}")

# 脱敏手机号
text = "客服电话:13812345678,投诉电话:15912345678"
pattern = r'(\d{3})\d{4}(\d{4})'
masked = re.sub(pattern, r'\1****\2', text)
print(masked)  # 客服电话:138****5678,投诉电话:159****5678

URL 匹配

import re

def extract_urls(text):
    """提取URL"""
    pattern = r'https?://[\w\-]+(\.[\w\-]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
    return re.findall(pattern, text)

text = """
访问我们的网站 https://www.example.com,
文档地址 https://docs.example.com/guide/tutorial.html,
或FTP站点 ftp://files.example.com/downloads
"""

urls = re.findall(r'https?://[\w\-]+(\.[\w\-]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?', text)
print(urls)

# 更精确的URL匹配
pattern = r'(?P<url>https?://(?:[\w-]+\.)+[\w-]+(?:/[\w\-./?%&=]*)?)'
matches = re.finditer(pattern, text)

for match in matches:
    print(match.group('url'))

IP 地址匹配

import re

def validate_ip(ip):
    """验证IPv4地址"""
    pattern = r'^((25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}(25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)$'
    return bool(re.match(pattern, ip))

# 测试
ips = [
    "192.168.1.1",    # 有效
    "255.255.255.255", # 有效
    "256.1.1.1",      # 无效(超过255)
    "192.168.1",      # 无效(缺少一段)
    "192.168.1.1.1",  # 无效(多了一段)
]

for ip in ips:
    print(f"{ip:20} -> {validate_ip(ip)}")

# 提取IP地址
log = "User connected from 192.168.1.100 at 10:30, admin from 10.0.0.1 at 11:45"
ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', log)
print(ips)  # ['192.168.1.100', '10.0.0.1']

日期时间匹配

import re

# 匹配日期(YYYY-MM-DD)
text = "今天是 2023-12-25, 明天是 2023-12-26"
dates = re.findall(r'\d{4}-\d{2}-\d{2}', text)
print(dates)  # ['2023-12-25', '2023-12-26']

# 验证日期格式
def validate_date(date_str):
    """验证日期格式 YYYY-MM-DD"""
    pattern = r'^(?P<year>\d{4})-(?P<month>0[1-9]|1[0-2])-(?P<day>0[1-9]|[12]\d|3[01])$'
    match = re.match(pattern, date_str)
    if match:
        return match.groupdict()
    return None

print(validate_date("2023-12-25"))
# {'year': '2023', 'month': '12', 'day': '25'}

print(validate_date("2023-13-45"))  # None

# 匹配时间 HH:MM:SS
text = "开始时间: 09:30:00, 结束时间: 17:30:00"
times = re.findall(r'\d{2}:\d{2}:\d{2}', text)
print(times)  # ['09:30:00', '17:30:00']

# 匹配日期时间
datetime_text = "2023-12-25 14:30:45"
pattern = r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2})'
match = re.match(pattern, datetime_text)

if match:
    date, time = match.groups()
    print(f"Date: {date}, Time: {time}")

HTML/XML 标签

import re

# 提取HTML标签内容
html = "<div>Hello</div><p>World</p><span>Test</span>"

# 提取标签内容(非贪婪)
pattern = r'<(div|p|span)>(.*?)</\1>'
matches = re.findall(pattern, html)

for tag, content in matches:
    print(f"{tag}: {content}")

# 提取所有标签
tags = re.findall(r'<(\w+)', html)
print(tags)  # ['div', 'p', 'span']

# 提取属性
html2 = '<a href="https://example.com" class="link" target="_blank">Link</a>'

# 提取href属性
href = re.search(r'href="([^"]*)"', html2)
if href:
    print(f"href: {href.group(1)}")  # href: https://example.com

# 提取所有属性
attrs = re.findall(r'(\w+)="([^"]*)"', html2)
print(attrs)
# [('href', 'https://example.com'), ('class', 'link'), ('target', '_blank')]

清理和规范化文本

import re

# 移除多余空白
text = "Hello    World   This   is   a    Test"
cleaned = re.sub(r'\s+', ' ', text)
print(cleaned)  # Hello World This is a Test

# 移除特殊字符
text2 = "Hello, World! @#$%^&*()"
cleaned = re.sub(r'[^\w\s]', '', text2)
print(cleaned)  # Hello World

# 标准化空格和标点
text3 = "Hello , world ! This is a test ."
pattern = r'\s+([,.!?])'
cleaned = re.sub(pattern, r'\1', text3)
print(cleaned)  # Hello, world! This is a test.

# 提取数字
text4 = "Price: $99.99, Discount: 20%, Quantity: 5"
numbers = re.findall(r'\d+\.?\d*', text4)
print(numbers)  # ['99.99', '20', '5']

# 移除HTML标签
html = "<div><p>Hello <b>World</b></p></div>"
text_only = re.sub(r'<[^>]+>', '', html)
print(text_only)  # Hello World

高级技巧

条件匹配

import re

# 使用(?(id/name)yes-pattern|no-pattern)
# 匹配两种格式的日期
text = "2023-12-25 and 12/25/2023"

# 如果以数字开头,匹配 YYYY-MM-DD,否则匹配 MM/DD/YYYY
pattern = r'^(\d)?(?(1)\d{3}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4})'

# 匹配带引号或不带引号的字符串
text2 = '"hello", "world", test'

# 如果有引号,匹配引号内的内容;否则匹配单词
pattern = r'(?:"([^"]*)"|(\w+))'
matches = re.findall(pattern, text2)

for quoted, unquoted in matches:
    if quoted:
        print(f"Quoted: {quoted}")
    elif unquoted:
        print(f"Unquoted: {unquoted}")

回溯控制

import re

# (?>atomic):原子分组,防止回溯
text = "aaaaaaaaaaaa"

# 普通匹配(有回溯)
pattern1 = r'(a+)+a'
match1 = re.match(pattern1, text)

# 原子分组(无回溯,性能更好)
pattern2 = r'(?(>a+))+a'
match2 = re.match(pattern2, text)

# 实际应用:匹配HTML标签
html = "<div>content</div>"

# 使用原子组提高性能
pattern = r'<(?!)(?>(?!<\/?\1>).)*<\/\1>'

递归匹配

import re
import regex  # 需要安装regex模块: pip install regex

# Python的re模块不支持递归,但regex模块支持
# 匹配括号嵌套
text = "(a(b(c)d)e)"

# 使用regex模块的递归模式
pattern = r'\((?:[^()]|(?R))*\)'
matches = regex.findall(pattern, text)
print(matches)  # ['(a(b(c)d)e)']

性能优化

编译正则表达式

import re

# 好的做法:编译后重复使用
pattern = re.compile(r'\b\w{4}\b')

# 在循环中使用
texts = ["word", "test", "code", "python"]
for text in texts:
    matches = pattern.findall(text)
    print(f"{text}: {matches}")

# 不好的做法:每次都重新编译
for text in texts:
    matches = re.findall(r'\b\w{4}\b', text)  # 效率低

避免贪婪回溯

import re

text = "a" * 100 + "b"

# 避免过度贪婪
# 不好:可能导致灾难性回溯
pattern1 = re.compile(r'^a+a+$')

# 更好的方式
pattern2 = re.compile(r'^a+b$')

# 对于HTML等结构化数据,使用专用解析器
import html.parser
# 而不是正则表达式

使用非捕获组

import re

text = "Python, Java, C++, JavaScript"

# 当不需要捕获分组时,使用非捕获组
pattern = r'(?:\w+),\s*(?:\w+)'
match = re.search(pattern, text)

# 对比
pattern_with_capture = r'(\w+),\s*(\w+)'
match_with_capture = re.search(pattern_with_capture, text)

# 非捕获组性能更好,因为不需要保存分组信息

实战案例

日志分析

import re

# 解析Apache访问日志
log_line = '127.0.0.1 - - [25/Dec/2023:10:30:45 +0800] "GET /index.html HTTP/1.1" 200 2326'

pattern = r'''
    (?P<ip>\d+\.\d+\.\d+\.\d+)           # IP地址
    \s+-\s+-\s+                          # 占位符
    \[(?P<timestamp>[^\]]+)\]            # 时间戳
    \s+"(?P<method>\w+)                  # 请求方法
    \s+(?P<path>[^\s]+)                  # 请求路径
    \s+(?P<protocol>[\w/.]+)"            # 协议
    \s+(?P<status>\d{3})                 # 状态码
    \s+(?P<size>\d+)                     # 响应大小
'''

match = re.match(pattern, log_line, re.VERBOSE)

if match:
    data = match.groupdict()
    print(f"IP: {data['ip']}")
    print(f"Path: {data['path']}")
    print(f"Status: {data['status']}")

# 分析日志文件
def analyze_logs(log_file):
    errors = []
    pattern = re.compile(r'\[([^\]]+)\].*"(5\d{2})\s+\d+')

    with open(log_file, 'r') as f:
        for line in f:
            match = pattern.search(line)
            if match:
                errors.append({
                    'timestamp': match.group(1),
                    'status': match.group(2)
                })

    return errors

数据清洗

import re

def clean_phone(phone):
    """清洗和标准化电话号码"""
    # 移除所有非数字字符
    digits = re.sub(r'\D', '', phone)

    # 验证长度
    if len(digits) == 11 and digits.startswith('1'):
        # 格式化为: 1-XXX-XXXX-XXXX
        return f"{digits[0]}-{digits[1:4]}-{digits[4:8]}-{digits[8:]}"

    return None

phones = [
    "138-1234-5678",
    "138 1234 5678",
    "(138) 1234-5678",
    "+86 138 1234 5678"
]

for phone in phones:
    cleaned = clean_phone(phone)
    print(f"{phone:25} -> {cleaned}")

# 清理用户输入
def clean_user_input(text):
    # 移除多余空白
    text = re.sub(r'\s+', ' ', text)
    # 移除特殊字符
    text = re.sub(r'[^\w\s\-@.]', '', text)
    # 去除首尾空白
    text = text.strip()
    return text

user_input = "  Hello!!!   World@#$  "
print(clean_user_input(user_input))  # Hello World@

文本替换

import re

# 敏感词过滤
text = "这是一个测试文本,包含一些敏感词汇"

# 使用字典替换
replacements = {
    '敏感': '***',
    '词汇': '**',
}

pattern = re.compile('|'.join(map(re.escape, replacements.keys())))
filtered = pattern.sub(lambda m: replacements[m.group()], text)
print(filtered)  # 这是一个测试文本,包含一些***

# 模板替换
template = "Hello {name}, welcome to {place}! Your balance is ${balance}."

data = {
    'name': 'Alice',
    'place': 'Python',
    'balance': '100.50'
}

result = template.format(**data)
print(result)  # Hello Alice, welcome to Python! Your balance is $100.50.

# 使用正则替换
pattern = r'\{(\w+)\}'
result2 = re.sub(pattern, lambda m: data.get(m.group(1), m.group()), template)
print(result2)

数据提取

import re

# 从文本中提取结构化数据
text = """
员工信息:
姓名:张三,年龄:25,部门:技术部
姓名:李四,年龄:30,部门:市场部
姓名:王五,年龄:28,部门:技术部
"""

pattern = r'姓名:(?P<name>\w+),年龄:(?P<age>\d+),部门:(?P<dept>\w+)'
employees = re.findall(pattern, text)

print(employees)
# [('张三', '25', '技术部'), ('李四', '30', '市场部'), ('王五', '28', '技术部')]

# 转换为字典列表
employees_list = [
    {'name': name, 'age': int(age), 'dept': dept}
    for name, age, dept in employees
]

print(employees_list)

# 提取表格数据
table_text = """
| ID | Name    | Age |
|----|---------|-----|
| 1  | Alice   | 25  |
| 2  | Bob     | 30  |
| 3  | Charlie | 35  |
"""

rows = re.findall(r'\|\s*(\d+)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*\|', table_text)

for row_id, name, age in rows:
    print(f"ID: {row_id}, Name: {name}, Age: {age}")

最佳实践

使用建议

import re

# 1. 使用原始字符串
pattern = r'\d+'  # 推荐
pattern = '\\d+'  # 不推荐

# 2. 编译常用的正则表达式
COMMON_PATTERNS = {
    'email': re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'),
    'phone': re.compile(r'1[3-9]\d{9}'),
    'ip': re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'),
}

# 使用编译好的模式
def validate_data(text, type_):
    pattern = COMMON_PATTERNS.get(type_)
    if pattern:
        return bool(pattern.match(text))
    return False

# 3. 使用具体量词而非贪婪匹配
# 不好:.*
# 好:.*? 或具体模式

# 4. 使用非捕获组提升性能
# 捕获组: (pattern)
# 非捕获组: (?:pattern)

# 5. 注释复杂的正则表达式
complex_pattern = re.compile(r'''
    ^                     # 字符串开始
    [a-zA-Z0-9._%+-]+     # 用户名
    @                     # @符号
    [a-zA-Z0-9.-]+        # 域名
    \.                    # 点号
    [a-zA-Z]{2,}          # 顶级域名
    $                     # 字符串结束
''', re.VERBOSE)

# 6. 错误处理
try:
    pattern = re.compile(r'(?:invalid')  # 不完整的正则
except re.error as e:
    print(f"正则表达式错误: {e}")

# 7. 测试正则表达式
def test_pattern(pattern, test_cases):
    for test_input, expected in test_cases:
        result = bool(pattern.match(test_input))
        status = "✓" if result == expected else "✗"
        print(f"{status} Input: {test_input:20} Expected: {expected}, Got: {result}")

# 测试邮箱模式
email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
test_cases = [
    ("user@example.com", True),
    ("invalid@", False),
    ("@example.com", False),
]

test_pattern(email_pattern, test_cases)

调试技巧

import re

# 使用re.DEBUG查看正则表达式解析过程
pattern = r'\b\w+\b'
re.compile(pattern, re.DEBUG)

# 分步构建复杂的正则表达式
# 第一步:匹配简单的单词
simple = re.compile(r'\w+')

# 第二步:添加边界
with_boundary = re.compile(r'\b\w+\b')

# 第三步:添加长度限制
final = re.compile(r'\b\w{4,}\b')

# 使用在线工具测试
# regex101.com, regexr.com, pythex.org

# 打印匹配结果用于调试
text = "Hello World"
match = re.search(r'\b\w+\b', text)

if match:
    print(f"Match: {match.group()}")
    print(f"Start: {match.start()}, End: {match.end()}")
    print(f"Span: {match.span()}")

总结

正则表达式是Python中强大的文本处理工具:

核心概念

- **re模块**:提供正则表达式支持
- **原始字符串**:使用r''避免转义问题
- **元字符**:特殊的字符(. ^ $ * + ? { } [ ] \ | ( ))
- **字符类**:\d \s \w 及其大写形式
- **量词**:* + ? {n} {n,m} 控制重复次数
- **锚点**:^ $ \b \B 匹配位置
- **分组**:()捕获, (?:)非捕获, (?P<name>)命名分组
- **断言**:先行和后行断言实现复杂匹配

最佳实践

使用原始字符串定义正则表达式
编译常用模式提升性能
使用非贪婪匹配避免过度匹配
使用非捕获组优化性能
添加注释提高可读性
编写测试用例验证正确性
对于复杂数据结构考虑专用解析器

常用场景

表单验证(邮箱、手机号、URL等)
日志分析和数据提取
文本清理和规范化
数据爬取和信息提取
搜索和替换操作

掌握正则表达式需要大量练习,建议从简单模式开始,逐步构建复杂匹配规则。

re 模块基础​

导入模块​

核心方法对比​

编译正则表达式​

元字符​

特殊字符列表​

字符匹配​

预定义字符类​

字符集​

重复匹配​

量词​

实际应用​

位置匹配​

锚点​

分组与捕获​

捕获分组​

命名分组​

非捕获分组​

反向引用​

贪婪与非贪婪​

贪婪匹配(默认)​

非贪婪匹配​

量词对比​

零宽断言​

先行断言(Lookahead)​

后行断言(Lookbehind)​

Match 对象​

Match 对象方法​

实际应用示例​

编译标志​

常用标志​

常用模式​

邮箱验证​

手机号验证​

URL 匹配​

IP 地址匹配​

日期时间匹配​

HTML/XML 标签​

清理和规范化文本​

高级技巧​

条件匹配​

回溯控制​

递归匹配​

性能优化​

编译正则表达式​

避免贪婪回溯​

使用非捕获组​

实战案例​

日志分析​

数据清洗​

文本替换​

数据提取​

最佳实践​

使用建议​

调试技巧​

总结​

核心概念​

最佳实践​

常用场景​

re 模块基础

导入模块

核心方法对比

编译正则表达式

元字符

特殊字符列表

字符匹配

预定义字符类

字符集

重复匹配

量词

实际应用

位置匹配

锚点

分组与捕获

捕获分组

命名分组

非捕获分组

反向引用

贪婪与非贪婪

贪婪匹配(默认)

非贪婪匹配

量词对比

零宽断言

先行断言(Lookahead)

后行断言(Lookbehind)

Match 对象

Match 对象方法

实际应用示例

编译标志

常用标志

常用模式

邮箱验证

手机号验证

URL 匹配

IP 地址匹配

日期时间匹配

HTML/XML 标签

清理和规范化文本

高级技巧

条件匹配

回溯控制

递归匹配

性能优化

编译正则表达式

避免贪婪回溯

使用非捕获组

实战案例

日志分析

数据清洗

文本替换

数据提取

最佳实践

使用建议

调试技巧

总结

核心概念

最佳实践

常用场景