0% found this document useful (0 votes)
12 views

Regular Expression 1

Uploaded by

Prasad Dhumale
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
12 views

Regular Expression 1

Uploaded by

Prasad Dhumale
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 17

Metacharacters allow you to construct patterns that match specific sequences of characters within strings.
^: Matches the beginning of a string or the beginning of a line in
multiline mode.
$: Matches the end of a string or the end of a line in multiline mode.
\A: Matches the beginning of a string (similar to ^ but does not
depend on multiline mode).
\Z: Matches the end of a string (similar to $ but does not depend on
multiline mode).
.: Matches any single character except newline characters \n.
*: Matches zero or more occurrences of the preceding character or
group.
+: Matches one or more occurrences of the preceding character or
group.
?: Matches zero or one occurrence of the preceding character or group
(makes it optional).
[]: Defines a character class, matches any single character inside the
brackets.
|: Acts as a logical OR, matches either the pattern on its left or the
pattern on its right.
(): Groups patterns together, allows applying quantifiers to multiple
characters or patterns.
\: Escapes metacharacters, allows using them as literal characters
(e.g., \., \\).
Unit 2, Chapter 2
Regular Expressions match and search methods

# to check whether a particular string is present in the input string or not
# we can use in operator
txt="Python is easy programming language"
print("Python" in txt)

print("python" in txt)

print(txt.find("hon"))

txt.find("xyz")
import re
txt="Python is easy programming language"

print("Match method to check python present are not")


print(re.match("Python",txt))
print()
print("Search method")
print(re.search("Python",txt))

print("Match method")
print(re.match("Perl",txt))
print()
print("Search method")
print(re.search("Perl",txt))

print("Match method to check python present are not")


print(re.match("programming",txt))
print()
print("Search method")
print(re.search("programming",txt))

# match method - checks only in the beginning,


#if it is not present in the beginning it will return you None

# search method - check in any part of the string


#if it is not present, it returns None

print("Match method")
print(re.match("Perl",txt))
print()
print("Search method")
print(re.search("Perl",txt))

print(re.match('P\w\w\w\w\w',txt))
#\w-alphanumeric- alphabet, number, underscore

print(re.match('p\w\w\w\w\w',txt))

print(re.search('P\w\w\w\w\w',txt))

print(re.search('p\w\w\w\w\w',txt))

txt="RE_VA university"
print(re.match('R\w\w',txt))

txt="RE-VA University"
print(re.match('R\w\w',txt))

txt="Python is easy programming language"


print(re.search('p\w+',txt))
txt="Python is easy p+rogramming language"
print(re.search('p\w+',txt))
txt="Python and Perl are programming languages"
print(re.search('P\w+',txt))

sword=re.search('P\w+',txt)
print(sword)
print()
print(sword.group())

mword=re.match('P\w+',txt)
print(mword)
print()
print(mword.group())

#replacing a string
#to replace the string by replace method, no package needed
txtre="Red Lorry Yellow Lorry"
xre=txtre.replace("Lorry","Bus")
print(xre)
print(txtre)

# replacing the string with Sub (substitution) method


# to use sub method import re
#make a note of syntax difference between sub and replace methods
import re
txt="Red Lorry Yellow Lorry"
x=re.sub("Lorry","Bus",txt)
print(txt)
print(x)

#split method
import re
txt="Red Lorry Yellow Lorry"
x=re.split("\s",txt)
print(x)

txt="Red Lorry,Yellow Lorry"


x=re.split(",",txt)
print(x)

txt="Red+Lorry-Yellow#Lorry"
x=re.split("[-+#]",txt)
print(x)

txt="RedBigLorry-YellowSmallLorry"
x=re.split("Big|Small|-",txt)
print(x)

txt="RedBigLorry-YellowSmallLorry"
x=re.split("[BigSmall-]",txt)
print(x)
txt="Red Lorry Yellow Lorry"
print(re.findall("Lorry",txt))
x=re.findall("Lorry",txt)
print(x)

print(re.search("Lorry",txt))

import re
txt="Python and Perl are programming languages"
print(re.findall('P\w+',txt))

txt="Python, PHP_ and Perl are programming languages"


print(re.findall('P\w\w\w',txt))

import re
txt="22-March_ 20"

# find all lowercase alphabet characters between "a" and "z"


x=re.findall("[a-z]",txt)
print(x)

# find all lower case characters between"a" and "d"


x=re.findall("[a-d]",txt)
print(x)

print(re.findall("hello world", "hello"))


print(re.findall("hello","hello world"))

txt="4th semester"
x=re.findall("[sem]",txt)
print(x)
print(re.findall("sem",txt))
print(re.match("sem",txt))
print(re.search("sem",txt))

re.findall("\d",txt)

re.findall("^4",txt)
# ^ searches for the occurrence of the expression at the beginning of the string

re.findall("4",txt)

re.findall("\A4",txt)
# same as ^ here; \A specifically matches only at the beginning of the entire string

^ matches the beginning of the string, and also matches the beginning of each line in a multiline string if the re.MULTILINE flag is set.
\A strictly matches the beginning of the string and does not consider the beginning of each line in a multiline string, whether or not re.MULTILINE is set.

txt="Python, PHP and Perl are Programming Languages"


print(re.findall('P\w\w\w',txt))

txt="Python, PHP and Perl are Programming Languages"


print(re.match('p\w\w\w',txt))

txt="Python, PHP and Perl are Programming Languages"


print(re.search('P\w\w\w',txt))

meta characters
. - any one character
+ - one time or more times
? - zero or one
* - zero or one or many

import re

# re.findall() demos for the metacharacters . \w * + ^ $ and \b.
# Each subject string is written as ONE literal: a quoted string cannot
# continue onto the next physical line. Patterns are raw strings so that
# "\w" and "\b" reach the regex engine unchanged.

# '.' matches every character except a newline (spaces and punctuation too).
result = re.findall(r'.', 'Python, PHP and Perl are Programming Language.')
print(result)

# '\w' matches one letter, digit or underscore per match.
result = re.findall(r'\w', 'Python, PHP and Perl are Programming Language.')
print(result)

# extract each word (* or +)
# '\w*' may match the empty string, so the list also contains '' entries
# at every position where no word character is found.
result = re.findall(r'\w*', 'Python, PHP and Perl are Programming Languages')
print(result)

# '\w+' needs at least one word character, so only the words are returned.
result = re.findall(r'\w+', 'Python, PHP and Perl are Programming Languages')
print(result)

# fetch first word
result = re.findall(r'^\w+', 'Python, PHP and Perl are Programming Languages')
print(result)

# fetch last word
# If we use "$" instead of "^", it returns the word from the end.
result = re.findall(r'\w+$', 'Python, PHP and Perl are Programming Languages')
print(result)
# With a trailing '.', no run of word characters touches the end: [] is printed.
result = re.findall(r'\w+$', 'Python, PHP and Perl are Programming Languages.')
print(result)

# Return the first two characters of each word
# Extract consecutive two characters of each word, excluding spaces (using "\w")
result = re.findall(r'\w\w', 'Python, PHP and Perl are Programming Languages')
print(result)

# '\w.' pairs a word character with ANY following character, so the second
# character of a pair may be a comma or a space.
result1 = re.findall(r'\w.', 'Python, PHP and Perl are Programming Languages')
print(result1)

# Extract consecutive two characters those available at start of word
# boundary (using "\b")
result1 = re.findall(r'\b\w{2}', 'Python, PHP and Perl are Programming Languages')
# or
# result1 = re.findall(r'\b\w\w', 'Python, PHP and Perl are Programming Languages')
print(result1)

#Python raw string is created by prefixing a string literal with ‘r’ or ‘R’.
#Python raw string treats backslash (\) as a literal character.
#This is useful when we want to have a string that contains backslash
#and don’t want it to be treated as an escape character.
s = 'Hi\nHello'
print(s)

raw_s = r'Hi\nHello'
print(raw_s)

s ='Hi\xHello'
print(s)

s = r'Hi\xHello'
print(s)

txt="trying to print \n character"


print(txt)

txt="trying\\ to print \\n char\\acter"


print(txt)
txt="trying to print \\n character \t without \\ "
print(txt)

import re
txt="Python\tprogramming"
print(txt)
print(re.search("on\s",txt))

txt=r"Python\tprogramming"
print(txt)
print(re.search("on\s",txt))

# \b
# Returns a match where the specified characters are at the beginning or at the end of a word
# (the "r" in the beginning is making sure that the string is being treated as a "raw string")

import re
txt="The cat sat on The mat!"
#check if the string starts with "The"

#check if the string starts with "the"

#check if the string ends with "mat"

#check if the string ends with"mat!"

# ^ beginning, $ end

import re
tnt="the clever fox\nthe not so clever fox\nthe blue box"
print(re.findall("^the",tnt,re.MULTILINE))

tnt="the clever fox \n the not so clever fox \n the blue box"
print(re.findall("^the",tnt,re.MULTILINE))

tnt="the clever fox \n the not so clever fox \n the blue box"
print(re.findall("^the",tnt,re.MULTILINE))

tnt="clever fox \n the not so clever fox \n the blue box"


print(re.findall("^the",tnt,re.MULTILINE))

tnt="clever fox \n the not so clever fox \n the blue box"


print(re.findall("^the",tnt,re.MULTILINE))

#write a program to search for a programmig language from a string


#"Python is a easy programming language"
import re
txt="Python is a easy programming language"
sword=re.search("prog\w+\s\w+",txt)
print(sword)
print(sword.group())

import re
txt = "Python is a easy programming language"
# "prog" plus the rest of that word, one whitespace character, then the
# following word. The raw string keeps \w and \s intact for the regex engine.
sword = re.search(r"prog\w+\s\w+", txt)
# sword = re.search(r"prog\w+", txt)
print(sword)
print(sword.group())

# re.search() returns None when nothing matches; a match object is truthy,
# so this test guards the .group() call inside the branch.
if sword:
    print("Substring found")
    print(sword.group())
else:
    print("Not found")

• \W - non alphabet, non digit, not an underscore


• \w - alphabet, digit, an underscore
• \s - space
• \S - not a space
• \d - a digit
• \D - not a digit

Repeaters: *, + and {}: These symbols act as repeaters and tell the computer that the
preceding character is to be used more than just one time.

+ – one or more times
* – zero or more times
? – optional (zero or one time)
^ – must start at the beginning of the string
$ – must end at the end of the string
{} – as many times as the value inside the braces: {2} means that the preceding
character is to be repeated exactly 2 times, {min,} means the preceding character is
matched min or more times, and {min,max} means that the preceding character is
repeated at least min and at most max times.
re.split('\W','Hello,hello-hello_hello')

re.split('\W','Hello,hello-hello_hello',maxsplit=1)

re.split('\W','Hello,hello-hello_hello',maxsplit=2)

re.split('\W','Hello,hello-hello_hello',maxsplit=3)

re.split('\W','Hello,hello-hello_hello',maxsplit=4)

Python Raw String


https://www.youtube.com/watch?v=kkTZ0EZws9Y&t=89s

Python Regular Expression - 1 - match, search


https://www.youtube.com/watch?v=kDEOZvjazLs&t=21s

Python Regular Expression - 2 - Sub, Split


https://www.youtube.com/watch?v=EYg1TBpWnYA&t=14s

Python Regular Expression - 3 - findall


https://www.youtube.com/watch?v=wWuX1BTaYoY&t=10s

Python Regular Expression - 4 - metacharacters ( ^, $ )


https://www.youtube.com/watch?v=Yukl9rp6xA0&t=4s

Python Regular Expression - 5 - metacharacters ( ^, $, \A, \Z )


https://www.youtube.com/watch?v=u68ZQ-pDz1Y&t=15s

Python Regular Expression - 6 - metacharacters -repetitors ( . + ? *)


https://www.youtube.com/watch?v=nP_BAaCsjS4&t=4s

Python Regular Expression - 7 split function with maxsplit parameter


https://www.youtube.com/watch?v=AI29X9Ok850

Python - Extracting Phone Number From A String


https://www.youtube.com/watch?v=7rmlWK8DD5Y

Python- Roman Number Hundreds & Fifties


https://www.youtube.com/watch?v=UT93oddqoMg

# \b
import re
txt = "The rain in spain"

# x = re.findall(r"\bain", txt)  # other patterns to try: \bin  \brain  ain\b

# "ain" followed by a word boundary, i.e. "ain" at the end of a word.
x = re.findall(r"ain\b", txt)
print(x)
# findall() returns a list; an empty list is falsy, so this reports
# whether at least one match was found.
if x:
    print("Yes, match found")
else:
    print("Not found")

['ain', 'ain']
Yes, match found

#Return the domain type of given email-ids

# Subject text as a single logical literal (adjacent string literals are
# concatenated by Python). Named `emails` rather than `str` so that the
# built-in `str` type is not shadowed.
emails = ('abc.test@gmail.com, xyz@test.in, test.first@analyticsvidhya.com, '
          'first.test@rest.biz')

# With one capturing group, findall() returns only the group: everything
# after "@". NOTE: the "." is unescaped, so it matches any single
# character, not only a dot.
x = re.findall(r'@(\w+.\w+)', emails)

print(x)
# The group is now placed after the separator, so only the domain type
# (com, in, biz) is returned.
x = re.findall(r'@\w+.(\w+)', emails)

print(x)

# Extract all characters after “@”


#1. output: ['@gmail.com', '@test.in', '@analyticsvidhya.com', '@rest.biz']

['gmail.com', 'test.in', 'analyticsvidhya.com', 'rest.biz']


['com', 'in', 'com', 'biz']

#Return the domain type of given email-ids

str= 'abc.test@gmail.com, xyz@test.in, test.first@analyticsvidhya.com,


first.test@rest.biz'

# Extract all characters after “@”


1. output: ['@gmail.com', '@test.in', '@analyticsvidhya.com',
'@rest.biz']

# Extract only domain name


2. output: ['com', 'in', 'com', 'biz']

1. r'@\w+.\w+'
2. r'@\w+.(\w+)'

# Subject text as a single logical literal; a quoted string cannot span
# several physical lines.
str1 = ('abc.test@gmail.com, xyz@test.in, '
        'test.first@analyticsvidhya.com, first.test@rest.biz')
# No capturing group: findall() returns each whole match, "@" included.
res = re.findall(r'@\w+.\w+', str1)
print(res)

['@gmail.com', '@test.in', '@analyticsvidhya.com', '@rest.biz']

# Subject text as a single logical literal; a quoted string cannot span
# several physical lines.
str1 = ('abc.test@gmail.com, xyz@test.in, '
        'test.first@analyticsvidhya.com, first.test@rest.biz')
# One capturing group: findall() returns only what the group matched.
res = re.findall(r'@\w+.(\w+)', str1)
print(res)

['com', 'in', 'com', 'biz']

r'@\w+.\w+': This regular expression matches an @ symbol followed by one or more word characters (\w+),
then any single character (.), and finally, one or more word characters (\w+).
The . in the middle matches any character except a newline.

r'@\w+.(\w+)': This regular expression matches an @ symbol followed by one or more word characters (\w+),
then any single character (the unescaped . is not a literal dot; write \. to match only a dot),
and finally, one or more word characters (\w+).
However, the difference here is that the parentheses around the second \w+ create a capturing group.
This means that only the characters matched by the second \w+
(i.e., the characters immediately following the separator)
will be captured as a group and returned by findall.

# Return date from given string

str= 'Amit 34-3456 12-05-2007, XYZ 56-4532 11-11-2011, ABC 67-8945 12-
01-2009'

# Extract complete date


1. Output: ['12-05-2007', '11-11-2011', '12-01-2009']

# Extract only year


2. Output: ['2007', '2011', '2009']

# Return all words of a string those starts with vowel

str= 'AV is largest Analytics community of India'

# Return words starts with vowels


Output: ['AV', 'is', 'Analytics', 'of', 'India']

#Return words starts with consonents


Output: ['largest', 'community']

1. r'\b[aeiouAEIOU]\w+'
2. r'\b[^aeiouAEIOU ]\w+'

# Street Address

s = '100 NORTH MAIN ROAD'


s.replace('ROAD', 'RD.')

'100 NORTH MAIN RD.'

s = '100 NORTH BROAD ROAD'


# -4-3-2-1
s.replace('ROAD', 'RD.')

'100 NORTH BRD. RD.'

print(s[-4])

R
# but want to replace only last occurence
s = '100 NORTH BROAD ROAD'
s[:-4] + s[-4:].replace('ROAD', 'RD.')

'100 NORTH BROAD RD.'

s[-4:]+s[-5:].replace('ROAD','RD.')
print(s)

100 NORTH BROAD ROAD

re.sub('ROAD$', 'RD.', s)

'100 NORTH BROAD RD.'

#validating phone number


import re
str1="This is my number +919123456789 and +919999999999"
res=re.search('\+91\d{10}',str1)
print(res.group())

+919123456789

import re
str1 = "This is my number +919123456789 and +919999999999"
# A literal "+91" followed by exactly ten digits; findall() collects every
# occurrence rather than only the first.
res = re.findall(r'\+91\d{10}', str1)
# print(res)
# An empty list is falsy, so this branch runs only when something matched.
if res:
    print("pattern matched")
    print(res)
else:
    print("Not matched")

pattern matched
['+919123456789', '+919999999999']

import re
str1 = "This is my number +919123456789 and +91-9999999999"
# As before, but "\-?" allows an optional hyphen between the country code
# and the ten digits.
res = re.findall(r'\+91\-?\d{10}', str1)
# print(res)
# An empty list is falsy, so this branch runs only when something matched.
if res:
    print("pattern matched")
    print(res)
else:
    print("Not matched")

pattern matched
['+919123456789', '+91-9999999999']

# Single logical literal; the account number "KAT080-22222222" must stay in
# one piece for the comparison below to be meaningful.
str2 = ("This is my number is 080-12345678 and account number is "
        "KAT080-22222222 and another number is 080-99999999")
# Without boundaries, the digits inside "KAT080-22222222" are matched too.
print(re.findall(r'\d{3}-\d{8}', str2))
# \b demands a word boundary before the first digit; "T0" has none, so the
# account number is skipped.
print(re.findall(r'\b\d{3}-\d{8}\b', str2))

['080-12345678', '080-22222222', '080-99999999']


['080-12345678', '080-99999999']

100 - C 150 - CL
200 - CC 250 - CCL
300 - CCC 350 - CCCL
400 - CD 450 - CDL
500 - D 550 - DL
600 - DC 650 - DCL
700 - DCC 750 - DCCL
800 - DCCC 850 - DCCCL
900 - CM 950 - CML

import re
ronum = input("enter roman number to match with hundred")
# ? - 0 or 1 occurrence - 900|400|100,200,300,500,600,700,800
# NOTE: every part of the third alternative is optional, so an empty input
# also satisfies this pattern.
if re.search('^(CM|CD|D?C?C?C?)$', ronum):
    print(ronum, " is a valid roman number")
    print("and match with hundred and multiple of hundreds")
else:
    print(ronum, "is not mathcing with hundred in roman number")

enter roman number to match with hundredCD


CD is a valid roman number
and match with hundred and multiple of hundreds

import re
ronum = input("enter roman number to match with hundred")
# ? - 0 or 1 occurrence - 900|400|100,200,300,500,600,700,800
# C{0,3} is the compact form of C?C?C?; the trailing L? additionally allows
# the fifties (CL, CCL, ..., CML).
# NOTE: every part is optional, so an empty input also satisfies this pattern.
if re.search('^(CM|CD|D?C{0,3})L?$', ronum):
    print(ronum, " is a valid roman number")
    print("and match with hundred and multiple of hundreds")
else:
    print(ronum, "is not mathcing with hundred in roman number")

enter roman number to match with hundredCM


CM is a valid roman number
and match with hundred and multiple of hundreds

# Verbose Regular Expressions

# pattern = '^M?M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)(IX|IV|V?I?I?I?)$'
# pattern = '^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'

# very complicated
# chances are there to forget
# comments and white spaces are ignored

# Roman numeral pattern, meant to be used with the re.VERBOSE flag.
# Under re.VERBOSE, whitespace is ignored and "#" starts a comment that runs
# to the end of the line. Every comment line must therefore begin with "#";
# a wrapped comment tail without it would be parsed as pattern text.
pattern = """
    ^                   # beginning of string
    M{0,4}              # thousands - 0 to 4 M's
    (CM|CD|D?C{0,3})    # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 C's),
                        #            or 500-800 (D, followed by 0 to 3 C's)
    (XC|XL|L?X{0,3})    # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 X's),
                        #        or 50-80 (L, followed by 0 to 3 X's)
    (IX|IV|V?I{0,3})    # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 I's),
                        #        or 5-8 (V, followed by 0 to 3 I's)
    $                   # end of string
    """

re.search(pattern, 'M', re.VERBOSE)

re.search(pattern, 'MCMLXXXIX', re.VERBOSE)

<re.Match object; span=(0, 9), match='MCMLXXXIX'>

#validating phone number


import re
str1="This is my number +919123456789 and +919999999999"
res=re.search('\+91\d{10}',str1)
print(res)

<re.Match object; span=(18, 31), match='+919123456789'>

Apply the phone pattern search method to search the following phone patterns (2 marks each):
i) 800-555-1212 ext. 1234
ii) work 1-(800) 555.1212 #1234
iii) 800-555-1212
iv) 800.555.1212

import re

# Compiled once; re.VERBOSE lets every part of the pattern carry a comment.
_PHONE_PATTERN = re.compile(
    r"""
    (?<!\w)                         # not glued to a preceding letter or digit
    (?:\d{3}[-.]|\(\d{3}\)\s*)      # area code: 800- or 800. or (800)
    \d{3}[-.]\d{4}\b                # 555-1212 or 555.1212
    (?:\s*(?:ext\.\s*|\#\s*)\d+)?   # optional extension: ext. 1234 or #1234
    """,
    re.VERBOSE,
)


def search_phone_patterns(text):
    """Print and return every phone number found in *text*.

    A number is a three-digit area code (``800-``, ``800.`` or ``(800)``),
    then three digits, a ``-`` or ``.`` and four digits, optionally
    followed by an extension written as ``ext. 1234`` or ``#1234``.

    The pattern starts with a look-behind instead of ``\\b``: a word
    boundary can never occur between ``-`` and ``(``, so ``\\b`` would
    reject ``1-(800) 555.1212``.

    Args:
        text: The string to scan.

    Returns:
        The list of matched substrings, in order of appearance (empty when
        nothing matches).
    """
    # The pattern has no capturing groups, so findall() yields whole matches.
    matches = _PHONE_PATTERN.findall(text)
    print("Phone number patterns found:")
    for match in matches:
        print(match)
    return matches

# Given phone patterns


patterns = [
    "800-555-1212 ext. 1234",
    "work 1-(800) 555.1212 #1234",
    "800-555-1212",
    "800.555.1212",
]

# Apply phone pattern search method to each sample string. The loop
# variable is named `sample` because each item is text to be searched,
# not a regular expression.
for sample in patterns:
    search_phone_patterns(sample)

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy