Regular Expression 1
Regular Expression 1
Regular expressions are patterns used to match sequences
of characters within strings.
^: Matches the beginning of a string or the beginning of a line in
multiline mode.
$: Matches the end of a string or the end of a line in multiline mode.
\A: Matches the beginning of a string (similar to ^ but does not
depend on multiline mode).
\Z: Matches the end of a string (similar to $ but does not depend on
multiline mode).
.: Matches any single character except newline characters \n.
*: Matches zero or more occurrences of the preceding character or
group.
+: Matches one or more occurrences of the preceding character or
group.
?: Matches zero or one occurrence of the preceding character or group
(makes it optional).
[]: Defines a character class, matches any single character inside the
brackets.
|: Acts as a logical OR, matches either the pattern on its left or the
pattern on its right.
(): Groups patterns together, allows applying quantifiers to multiple
characters or patterns.
\: Escapes metacharacters, allows using them as literal characters
(e.g., \., \\).
Unit 2, Chapter 2
Regular Expressions match and search methods
print("python" in txt)
print(txt.find("hon"))
txt.find("xyz")
import re
txt="Python is easy programming language"
print("Match method")
print(re.match("Perl",txt))
print()
print("Search method")
print(re.search("Perl",txt))
print("Match method")
print(re.match("Perl",txt))
print()
print("Search method")
print(re.search("Perl",txt))
print(re.match('P\w\w\w\w\w',txt))
#\w-alphanumeric- alphabet, number, underscore
print(re.match('p\w\w\w\w\w',txt))
print(re.search('P\w\w\w\w\w',txt))
print(re.search('p\w\w\w\w\w',txt))
txt="RE_VA university"
print(re.match('R\w\w',txt))
txt="RE-VA University"
print(re.match('R\w\w',txt))
sword=re.search('P\w+',txt)
print(sword)
print()
print(sword.group())
mword=re.match('P\w+',txt)
print(mword)
print()
print(mword.group())
#replacing a string
#to replace the string by replace method, no package needed
txtre="Red Lorry Yellow Lorry"
xre=txtre.replace("Lorry","Bus")
print(xre)
print(txtre)
#split method
import re
txt="Red Lorry Yellow Lorry"
x=re.split("\s",txt)
print(x)
txt="Red+Lorry-Yellow#Lorry"
x=re.split("[-+#]",txt)
print(x)
txt="RedBigLorry-YellowSmallLorry"
x=re.split("Big|Small|-",txt)
print(x)
txt="RedBigLorry-YellowSmallLorry"
x=re.split("[BigSmall-]",txt)
print(x)
txt="Red Lorry Yellow Lorry"
print(re.findall("Lorry",txt))
x=re.findall("Lorry",txt)
print(x)
print(re.search("Lorry",txt))
import re
txt="Python and Perl are programming languages"
print(re.findall('P\w+',txt))
import re
txt="22-March_ 20"
txt="4th semester"
x=re.findall("[sem]",txt)
print(x)
print(re.findall("sem",txt))
print(re.match("sem",txt))
print(re.search("sem",txt))
re.findall("\d",txt)
re.findall("^4",txt)
# ^ searches for an occurrence of the expression at the beginning of the
# string
re.findall("4",txt)
re.findall("\A4",txt)
# same as ^, but \A matches only at the very beginning of the
# entire string, even in multiline mode
^ matches the beginning of the string and, if the re.MULTILINE flag is
set, also matches the beginning of each line in a multiline string.
\A strictly matches the beginning of the string and does not consider
the beginning of each line in a multiline string.
meta characters
. - any one character
+ - one time or more times
? - zero or one
* - zero or one or many
import re

# '.' matches any single character except a newline, so findall returns the
# sentence one character at a time (spaces and punctuation included).
# The sentence is kept on one line: a quoted string cannot span lines.
result = re.findall(r'.', 'Python, PHP and Perl are Programming Language.')
print(result)
import re

# '\w' matches one word character (letter, digit or underscore), so spaces,
# commas and the final dot are dropped from the result.
# Raw string: '\w' is not a valid Python string escape.
result = re.findall(r'\w', 'Python, PHP and Perl are Programming Language.')
print(result)
import re

# '\w+' matches a run of one or more word characters, i.e. whole words.
# Raw string: '\w' is not a valid Python string escape.
result = re.findall(r'\w+', 'Python, PHP and Perl are Programming Languages')
print(result)
# If we use "$" instead of "^", the pattern is anchored at the end of the
# string, so it returns the last word
import re

# '\w+$' only matches a word that runs up to the end of the string,
# so just the last word is returned.
result = re.findall(r'\w+$', 'Python, PHP and Perl are Programming Languages')
print(result)
# Same pattern, but the text now ends with '.', which is not a word
# character: no word touches the end of the string, so the result is empty.
result = re.findall(r'\w+$', 'Python, PHP and Perl are Programming Languages.')
print(result)
raw_s = r'Hi\nHello'
print(raw_s)
s ='Hi\xHello'
print(s)
s = r'Hi\xHello'
print(s)
import re
txt="Python\tprogramming"
print(txt)
print(re.search("on\s",txt))
txt=r"Python\tprogramming"
print(txt)
print(re.search("on\s",txt))
# \b
# Returns a match where the specified characters are at the beginning
# or at the end of a word
# (the "r" at the beginning makes sure that the string is
# treated as a "raw string")
import re
txt="The cat sat on The mat!"
#check if the string starts with "The"
# ^ beginning, $ end
import re
tnt="the clever fox\nthe not so clever fox\nthe blue box"
print(re.findall("^the",tnt,re.MULTILINE))
tnt="the clever fox \n the not so clever fox \n the blue box"
print(re.findall("^the",tnt,re.MULTILINE))
tnt="the clever fox \n the not so clever fox \n the blue box"
print(re.findall("^the",tnt,re.MULTILINE))
import re

txt = "Python is a easy programming language"

# Find the word starting with "prog" together with the word that follows it.
# Alternative, first word only: re.search(r"prog\w+", txt)
sword = re.search(r"prog\w+\s\w+", txt)
print(sword)

# re.search returns None when nothing matches, so .group() may only be
# called after the match object has been checked.
if sword:
    print("Substring found")
    print(sword.group())
else:
    print("Not found")
Repeaters: *, + and {}: These symbols act as repeaters and tell the computer that the
preceding character is to be used more than just one time.
• + – one or more times
• * – zero or more times
• ? – optional (zero or one time)
• ^ – must start at the beginning of the string
• $ – must end at the end of the string
• {} – as many times as the value inside the braces: {2} means that the
preceding character is repeated exactly 2 times, {min,} means the preceding
character is matched min or more times, and {min,max} means that the preceding
character is repeated at least min and at most max times.
re.split('\W','Hello,hello-hello_hello')
re.split('\W','Hello,hello-hello_hello',maxsplit=1)
re.split('\W','Hello,hello-hello_hello',maxsplit=2)
re.split('\W','Hello,hello-hello_hello',maxsplit=3)
re.split('\W','Hello,hello-hello_hello',maxsplit=4)
# \b
import re

txt = "The rain in spain"

# r"ain\b" matches "ain" only where it is followed by a word boundary,
# i.e. at the end of a word. The raw string keeps "\b" from being read
# as a backspace character.
x = re.findall(r"ain\b", txt)
print(x)

# findall returns a list; an empty list is falsy.
if x:
    print("Yes, match found")
else:
    print("Not found")
['ain', 'ain']
Yes, match found
x=re.findall('@(\w+.\w+)',str)
print(x)
x=re.findall('@\w+.(\w+)',str)
print(x)
1. r'@\w+.\w+
2. r'@\w+.(\w+)
str= 'Amit 34-3456 12-05-2007, XYZ 56-4532 11-11-2011, ABC 67-8945 12-
01-2009'
1. r'\b[aeiouAEIOU]\w+
2. r'\b[^aeiouAEIOU ]\w+
# Street Address
print(s[-4])
R
# but want to replace only last occurence
s = '100 NORTH BROAD ROAD'
s[:-4] + s[-4:].replace('ROAD', 'RD.')
s[-4:]+s[-5:].replace('ROAD','RD.')
print(s)
re.sub('ROAD$', 'RD.', s)
+919123456789
import re

str1 = "This is my number +919123456789 and +919999999999"

# A literal "+91" country code followed by exactly ten digits.
# Raw string: "\+" and "\d" are not valid Python string escapes.
res = re.findall(r'\+91\d{10}', str1)

# findall returns a list; an empty list is falsy.
if res:
    print("pattern matched")
    print(res)
else:
    print("Not matched")
pattern matched
['+919123456789', '+919999999999']
import re

str1 = "This is my number +919123456789 and +91-9999999999"

# "+91", an optional "-" separator, then exactly ten digits.
# Raw string: "\+" and "\d" are not valid Python string escapes.
res = re.findall(r'\+91-?\d{10}', str1)

# findall returns a list; an empty list is falsy.
if res:
    print("pattern matched")
    print(res)
else:
    print("Not matched")
pattern matched
['+919123456789', '+91-9999999999']
100 - C 150 - CL
200 - CC 250 - CCL
300 - CCC 350 - CCCL
400 - CD 450 - CDL
500 - D 550 - DL
600 - DC 650 - DCL
700 - DCC 750 - DCCL
800 - DCCC 850 - DCCCL
900 - CM 950 - CML
import re


def is_roman_hundreds(ronum):
    """Return True if *ronum* is a Roman numeral for 100, 200, ..., 900.

    The pattern accepts CM (900), CD (400), or an optional D followed by
    up to three C's (100-300 and 500-800).
    """
    # Every part of D?C?C?C? is optional, so the pattern alone also
    # matches the empty string; reject that case explicitly.
    if not ronum:
        return False
    return re.search(r'^(CM|CD|D?C?C?C?)$', ronum) is not None


if __name__ == "__main__":
    ronum = input("enter roman number to match with hundred")
    if is_roman_hundreds(ronum):
        print(ronum, " is a valid roman number")
        print("and match with hundred and multiple of hundreds")
    else:
        print(ronum, "is not mathcing with hundred in roman number")
import re


def is_roman_hundreds_or_fifties(ronum):
    """Return True if *ronum* is a Roman numeral for 100-900, optionally plus 50.

    The hundreds part is CM (900), CD (400), or an optional D followed by
    up to three C's; an optional trailing L adds 50 (e.g. CL = 150).
    """
    match = re.search(r'^(CM|CD|D?C{0,3})L?$', ronum)
    # D?C{0,3} can match nothing, so "" and a bare "L" satisfy the pattern;
    # require a non-empty hundreds group so only 100-950 are accepted.
    return match is not None and match.group(1) != ""


if __name__ == "__main__":
    ronum = input("enter roman number to match with hundred")
    if is_roman_hundreds_or_fifties(ronum):
        print(ronum, " is a valid roman number")
        print("and match with hundred and multiple of hundreds")
    else:
        print(ronum, "is not mathcing with hundred in roman number")
# pattern = '^M?M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)(IX|IV|V?I?I?I?)$'
# pattern = '^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
# very complicated
# chances are there to forget
# comments and white spaces are ignored
# Verbose Roman-numeral pattern; compile or search with the re.VERBOSE flag
# so that whitespace and "#" comments inside the pattern are ignored.
# Each in-pattern comment must stay on one line (or restart with "#"):
# text after a line break is no longer a comment and becomes pattern text.
pattern = """
    ^                   # beginning of string
    M{0,4}              # thousands - 0 to 4 M's
    (CM|CD|D?C{0,3})    # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 C's),
                        #            or 500-800 (D, followed by 0 to 3 C's)
    (XC|XL|L?X{0,3})    # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 X's),
                        #        or 50-80 (L, followed by 0 to 3 X's)
    (IX|IV|V?I{0,3})    # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 I's),
                        #        or 5-8 (V, followed by 0 to 3 I's)
    $                   # end of string
    """
Apply the phone pattern search method to search for the following phone
patterns.
i) 800-555-1212 ext. 1234
ii) work 1-(800) 555.1212 #1234
iii) 800-555-1212
iv) 800.555.1212
(Each one: 2 marks)
import re
def search_phone_patterns(text):
# Define the regular expression pattern for phone numbers
pattern = r'\b(?:\d{3}[-.]|\(\d{3}\)\s*)\d{3}[-.]\d{4}\b(?:\
s*ext\. \d+)?'