How to Use Regular Expressions in Python
Learn how to use the re module in Python to search, match, and manipulate text with regular expressions.
Regular expressions (regex) are powerful patterns for matching and manipulating text. Python’s re module provides comprehensive regex support.
Importing re
import re
Basic Functions
re.search() - Find First Match
import re

text = "The rain in Spain falls mainly in the plain"

# re.search() scans the whole string and returns a match object for the
# first place the pattern matches.
match = re.search(r"Spain", text)
if match:
    print(f"Found: {match.group()}")  # Found: Spain
    print(f"Position: {match.start()}-{match.end()}")  # Position: 12-17

# When the pattern is not found anywhere, re.search() returns None.
result = re.search(r"France", text)
print(result)  # None
re.match() - Match at Beginning
import re

text = "Hello World"

# re.match() only tries the pattern at index 0. "Hello" starts the string,
# so this succeeds.
greeting_pattern = r"Hello"
match = re.match(greeting_pattern, text)
print(match[0])  # Hello

# "World" occurs later in the string, so re.match() reports no match.
trailing_pattern = r"World"
match = re.match(trailing_pattern, text)
print(match)  # None
re.findall() - Find All Matches
import re

text = "cat bat rat cat hat"

# A literal pattern: findall() returns every non-overlapping hit, in order.
literal = r"cat"
matches = re.findall(literal, text)
print(matches)  # ['cat', 'cat']

# \b on both sides restricts each hit to a whole word; \w+at requires that
# word to end in "at".
at_words = r"\b\w+at\b"
matches = re.findall(at_words, text)
print(matches)  # ['cat', 'bat', 'rat', 'cat', 'hat']
re.sub() - Replace Matches
import re

text = "Hello World"

# Replace every occurrence of the pattern with a fixed string.
new_text = re.sub(r"World", "Python", text)
print(new_text)  # Hello Python


def uppercase(match):
    """Return the text of *match* converted to upper case.

    Used below as the replacement callback for re.sub(), which calls it with
    each match object and substitutes the string it returns.
    """
    return match.group().upper()


# Replace with a function: every whole word is passed through uppercase().
text = "hello world"
new_text = re.sub(r"\b\w+\b", uppercase, text)
print(new_text)  # HELLO WORLD
Common Patterns
Character Classes
import re

text = "abc123XYZ"

# \d matches one digit; \d+ groups a run of consecutive digits into one hit.
single_digits = re.findall(r"\d", text)
print(single_digits)  # ['1', '2', '3']
digit_runs = re.findall(r"\d+", text)
print(digit_runs)  # ['123']

# \w matches word characters: letters, digits and underscore. For str
# patterns this is Unicode-aware unless the re.ASCII flag is given.
word_runs = re.findall(r"\w+", text)
print(word_runs)  # ['abc123XYZ']

# \s matches a whitespace character.
text = "Hello World"
spaces = re.findall(r"\s", text)
print(spaces)  # [' ']

# [...] defines a custom set of characters; here, the lowercase vowels.
vowels = re.findall(r"[aeiou]", "hello world")
print(vowels)  # ['e', 'o', 'o']
Quantifiers
import re

# Quantifier cheat sheet:
#   *      zero or more
#   +      one or more
#   ?      zero or one
#   {n}    exactly n
#   {n,m}  between n and m repetitions
text = "aaa ab a"

one_or_more = re.findall(r"a+", text)
print(one_or_more)  # ['aaa', 'a', 'a']

# a* can match the empty string, so findall() also reports an empty match at
# every position where no run of "a" starts: before each space, before "b",
# and at the end of the string.
zero_or_more = re.findall(r"a*", text)
print(zero_or_more)  # ['aaa', '', 'a', '', '', 'a', '']

# Matches never overlap: "aaa" yields one "aa" and the leftover "a" is too short.
exactly_two = re.findall(r"a{2}", text)
print(exactly_two)  # ['aa']

one_to_three = re.findall(r"a{1,3}", text)
print(one_to_three)  # ['aaa', 'a', 'a']
Anchors
import re

text = "hello world"

# ^ anchors the pattern to the start of the string.
starts_with_hello = re.search(r"^hello", text)
print(starts_with_hello)  # Match
starts_with_world = re.search(r"^world", text)
print(starts_with_world)  # None

# $ anchors the pattern to the end of the string.
ends_with_world = re.search(r"world$", text)
print(ends_with_world)  # Match
ends_with_hello = re.search(r"hello$", text)
print(ends_with_hello)  # None

# \b matches at a word boundary, so only the standalone word is found.
text = "cat category"
whole_words = re.findall(r"\bcat\b", text)
print(whole_words)  # ['cat'] (not 'category')
Groups
Basic Groups
import re

text = "John Smith, Jane Doe"

# Parentheses create capture groups. With more than one group in the
# pattern, findall() returns one tuple of captures per match.
pattern = r"(\w+) (\w+)"
matches = re.findall(pattern, text)
print(matches)  # [('John', 'Smith'), ('Jane', 'Doe')]

# On a match object, group 0 is the entire match and groups 1..n are the
# individual captures.
match = re.search(pattern, text)
whole, first, second = match.group(0, 1, 2)
print(whole)  # John Smith (entire match)
print(first)  # John (first group)
print(second)  # Smith (second group)
Named Groups
import re

text = "john@example.com"

# (?P<name>...) labels a group so it can be fetched by name instead of index.
pattern = r"(?P<username>\w+)@(?P<domain>\w+\.\w+)"
match = re.search(pattern, text)

username = match["username"]
domain = match["domain"]
print(username)  # john
print(domain)  # example.com

# groupdict() returns all named groups as a name -> matched-text mapping.
print(match.groupdict())  # {'username': 'john', 'domain': 'example.com'}
Practical Examples
Email Validation
import re


def is_valid_email(email):
    """Return True if *email* has the basic shape ``local@domain.tld``.

    This is a deliberately simple shape check (word characters, dots and
    hyphens on both sides of one ``@``, ending in a dot plus a final label).
    It is not a full RFC-compliant validator.
    """
    pattern = r"^[\w.-]+@[\w.-]+\.\w+$"
    # fullmatch() requires the entire string to match. With match(), the
    # trailing "$" also matches just before a final newline, so
    # "user@example.com\n" would wrongly be accepted.
    return bool(re.fullmatch(pattern, email))


print(is_valid_email("user@example.com"))  # True
print(is_valid_email("invalid-email"))  # False
Phone Number Extraction
import re

text = "Call me at 123-456-7890 or (555) 123-4567"

# Optional parentheses around the area code, then 3-3-4 digits with an
# optional "-", "." or whitespace separator between the parts.
pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"

# The pattern has no capture groups, so each hit's full text is the number.
phones = [hit.group() for hit in re.finditer(pattern, text)]
print(phones)  # ['123-456-7890', '(555) 123-4567']
URL Parsing
import re

url = "https://www.example.com/path/page?query=value"

# One named group per URL component; the path and query groups are optional.
pattern = r"(?P<protocol>https?)://(?P<domain>[\w.]+)(?P<path>/[\w/]*)?(?P<query>\?.*)?"
match = re.search(pattern, url)

# Prints, in order: https, www.example.com, /path/page
for component in ("protocol", "domain", "path"):
    print(match.group(component))
Text Cleaning
import re

# The sample deliberately contains runs of several spaces so that the
# whitespace clean-up below has something to collapse.
text = "Hello,   World!  How   are you?"

# Remove extra whitespace: every run of whitespace becomes a single space.
clean = re.sub(r"\s+", " ", text)
print(clean)  # Hello, World! How are you?

# Remove non-alphanumeric characters: drop everything that is neither a word
# character nor whitespace (i.e. the punctuation) from the normalised text.
clean = re.sub(r"[^\w\s]", "", clean)
print(clean)  # Hello World How are you
Flags
import re

text = "Hello WORLD"

# re.IGNORECASE lets the lowercase pattern match the uppercase text.
match = re.search(r"world", text, flags=re.IGNORECASE)
print(match.group())  # WORLD

# re.MULTILINE makes ^ and $ match at the start and end of every line,
# not just at the start and end of the whole string.
text = "line1\nline2"
matches = re.findall(r"^line\d", text, flags=re.MULTILINE)
print(matches)  # ['line1', 'line2']

# re.VERBOSE ignores unescaped whitespace outside character classes and
# allows "#" comments, so a pattern can be laid out and annotated.
pattern = re.compile(r"""
\d{3} # Area code
[-.\s]? # Separator
\d{3} # First 3 digits
[-.\s]? # Separator
\d{4} # Last 4 digits
""", flags=re.VERBOSE)
Summary
- Use re.search() to find the first match
- Use re.findall() to find all matches
- Use re.sub() to replace matches
- Use () for groups, (?P<name>...) for named groups
- Common patterns: \d (digit), \w (word character), \s (whitespace)
- Quantifiers: *, +, ?, {n}, {n,m}
- Anchors: ^ (start), $ (end), \b (word boundary)