Commit 965d623f authored by Josiah Wang
Browse files

Added files and solutions for Regular Expressions exercises

parent 46657d35
""" Regular expressions: Challenge 1 - Experiments
Author: Josiah Wang
"""
import re
import sys
def load_report(filename):
    """ Load a scikit-learn report from a text file.

    Args:
        filename (str) : Path to the file containing the scikit-learn report

    Returns:
        str : The whole content of the file
    """
    with open(filename) as handle:
        return handle.read()
def extract_accuracy(content):
    """ Extract the overall accuracy from a scikit-learn report.

    The value is taken from the first place where the word "accuracy" is
    followed by a decimal score and then an integer (the support count),
    e.g. ``accuracy 0.89 45``.

    Args:
        content (str) : Content of the report.

    Returns:
        float : The accuracy, or 0.0 when no accuracy row is found
    """
    # Raw string so the regex escapes reach the re module untouched. The
    # decimal point is escaped so only a literal "." is accepted; otherwise
    # text such as "0x89" would match and make float() raise.
    pattern = r"accuracy\s*(\d\.\d+)\s*(?:\d+)"
    match = re.search(pattern, content)
    if match is None:
        return 0.
    return float(match.group(1))
def extract_f1(content):
    """ Print the f1-score of each numbered class in a scikit-learn report.

    A class row is matched as five columns on a single line: an integer
    class label, precision, recall, f1-score and an integer support count.
    One line of the form ``f1-score for class <label> = <f1>`` is printed
    per matching row.

    Args:
        content (str) : Content of the report.
    """
    # Raw string with escaped decimal points, so only real "d.dd" scores
    # match. Columns must be separated by at least one space or tab on the
    # same line: with optional whitespace, a row lacking its support column
    # could still match by splitting the f1 digits ("0.35" -> "0.3" + "5"),
    # and a row could silently continue onto the next line.
    pattern = r"(\d+)[ \t]+\d\.\d+[ \t]+\d\.\d+[ \t]+(\d\.\d+)[ \t]+\d+"
    for match in re.finditer(pattern, content):
        print(f"f1-score for class {match.group(1)} = {match.group(2)}")
if __name__ == "__main__":
    # Use the path given on the command line, else fall back to the default
    filename = sys.argv[1] if len(sys.argv) > 1 else "classification1.txt"

    content = load_report(filename)

    accuracy = extract_accuracy(content)
    print(accuracy)

    extract_f1(content)
\ No newline at end of file
""" Regular expressions: Challenge 2 - Grocery shopping
Author: Josiah Wang
"""
import re
import sys
def load_receipt(filename):
    """ Read in receipt from a text file.

    Args:
        filename (str) : Path to file containing the receipt

    Returns:
        str : Content of the file
    """
    # Receipt prices carry a "£" sign, so decode explicitly instead of
    # relying on the platform's default encoding.
    # NOTE(review): assumes the receipt file is stored as UTF-8 - confirm.
    with open(filename, encoding="utf-8") as receipt_file:
        return receipt_file.read()
def extract_groceries(receipt):
    """ Extract grocery items from a receipt

    An item is matched as a name (word characters and spaces), a run of
    digits, and a "£" price that ends in the letter D or V. The digit run
    before the price is not included in the returned name.

    Args:
        receipt (str) : Content of the receipt.

    Returns:
        list[tuple] : List of (item name, price), in receipt order
    """
    # Raw string so the regex escapes reach the re module untouched. The
    # decimal point of the price is escaped so only a literal "." matches;
    # otherwise text such as "£1x09D" would match and make float() raise.
    pattern = r"([\w ]+?)\s*\d+\s*£(\d+\.\d+)[DV]"
    return [
        (match.group(1), float(match.group(2)))
        for match in re.finditer(pattern, receipt)
    ]
def extract_total(receipt):
    """ Extract total from a receipt

    The value is taken from the first place where "TOTAL" is followed,
    after optional whitespace, by a "£" amount.

    Args:
        receipt (str) : Content of the receipt.

    Returns:
        float : total as listed on the receipt, or 0.0 when none is found
    """
    # Raw string with an escaped decimal point: only a literal "." is
    # accepted, so float() never receives text such as "11x98".
    pattern = r"TOTAL\s*£(\d+\.\d+)"
    match = re.search(pattern, receipt)
    if match is None:
        return 0.
    return float(match.group(1))
if __name__ == "__main__":
    # Use the path given on the command line, else fall back to the default
    filename = sys.argv[1] if len(sys.argv) > 1 else "receipt.txt"
    receipt = load_receipt(filename)

    groceries = extract_groceries(receipt)
    print(groceries)

    listed_total = extract_total(receipt)
    print(listed_total)

    # Add up the extracted prices to cross-check the printed total
    calculated_total = sum(price for _, price in groceries)
    print(calculated_total)

    # Compare with a small tolerance because the prices are floats
    assert abs(listed_total - calculated_total) < 0.000000001
\ No newline at end of file
""" Regular expressions: Challenge 3 - Harvesting emails
Author: Josiah Wang
"""
import re
import sys
def load_email(filename):
    """ Load an email from a text file.

    Args:
        filename (str) : Path to the file containing the email

    Returns:
        str : The whole content of the email file
    """
    with open(filename) as handle:
        return handle.read()
def extract_recipients(email):
    """ Find the To: field and extract the recipients list

    Only a line that starts with "To: " is considered. The rest of that
    line is returned unparsed.

    Args:
        email (str) : Content of the email.

    Returns:
        str : the recipients list, or "" when there is no To: line
    """
    # Anchor at the start of a line so headers that merely end in "To: "
    # (e.g. "Reply-To: ") are not mistaken for the To: field. "$" with
    # re.MULTILINE also accepts a To: line at the very end of the text
    # that has no trailing newline.
    pattern = r"^To: (.*)$"
    match = re.search(pattern, email, flags=re.MULTILINE)
    if match is None:
        return ""
    return match.group(1)
def print_student_emails(email):
    """ Print the name and address of each recipient in the To: field.

    A recipient is recognised as a double-quoted name made of letters and
    spaces, followed by an address in angle brackets. Each one is printed
    on its own line as ``name, address``.

    Args:
        email (str) : Content of the email.
    """
    # Isolate the To: field first, then pick each recipient out of it
    to_field = extract_recipients(email)
    recipient = re.compile('"([A-Za-z ]+)" <(.*?)>')
    for name, address in recipient.findall(to_field):
        print(f"{name}, {address}")
if __name__ == "__main__":
    # Use the path given on the command line, else fall back to the default
    filename = sys.argv[1] if len(sys.argv) > 1 else "email.txt"

    email = load_email(filename)
    print_student_emails(email)
\ No newline at end of file
""" Regular expressions: Challenge 4 - Screen scraper
Author: Josiah Wang
"""
import re
import sys
def load_html(filename):
    """ Read in HTML source from a text file.

    Leading and trailing whitespace is removed from every line, and the
    lines are joined back together with a single newline between them.

    Args:
        filename (str) : Path to file containing the HTML source

    Returns:
        str : Content of the HTML source, with each line stripped
    """
    # Decode explicitly instead of relying on the platform's default
    # encoding.
    # NOTE(review): assumes the HTML file is stored as UTF-8 - confirm.
    with open(filename, encoding="utf-8") as html_file:
        # Need to make sure that whitespace is removed from start and end
        # of line for easier regex matching
        return "\n".join(line.strip() for line in html_file)
def print_staff_info(html):
    """ Find staff entries in HTML source and print one line per entry.

    Each line has the form ``name, email, link``. The link part is printed
    as ``None`` for an entry whose name is not wrapped in a name-link
    anchor.

    Args:
        html (str) : Content of the HTML source.
    """
    # The pattern is matched literally, line by line, so it is sensitive to
    # whitespace: it must stay flush left inside the string literal.
    staff_entry = re.compile('''
<div class="name-wrapper">
<h4>Personal details</h4>
(?:<a class="name-link" target="_blank" href="(.*?)">)?<span class="person-name">(.*?)</span>(?:</a>)?
<span class="job-title">(?:.*?)</span>
<p class="contact"><a class="email" href="mailto:(.*?)">Send email</a>(?:<span class="tel">(?:.*?)</span>)?</p>
</div>''')
    for entry in staff_entry.finditer(html):
        # Groups appear in the pattern in the order: link, name, email
        link, name, address = entry.groups()
        print(f"{name}, {address}, {link}")
if __name__ == "__main__":
    # Use the path given on the command line, else fall back to the default
    filename = sys.argv[1] if len(sys.argv) > 1 else "staff.txt"

    html = load_html(filename)
    print_staff_info(html)
\ No newline at end of file
precision recall f1-score support
0 0.90 0.95 0.92 19
1 0.91 0.77 0.83 13
2 0.86 0.92 0.89 13
accuracy 0.89 45
macro avg 0.89 0.88 0.88 45
weighted avg 0.89 0.89 0.89 45
precision recall f1-score support
0 0.97 0.80 0.88 75
1 0.80 0.85 0.82 52
2 0.83 0.93 0.87 41
3 0.82 0.81 0.82 52
4 0.79 0.88 0.83 51
accuracy 0.85 271
macro avg 0.84 0.85 0.84 271
weighted avg 0.85 0.85 0.85 271
MIME-Version: 1.0
Date: Wed, 21 Oct 2020 11:02:52 +0100
Message-ID: <AdJIE+ww21321@xx.yy.com>
Subject: Hello Pythonistas
From: The Boss <the.boss@abc.ac.uk>
To: "Don Garrett" <don.garrett@food.bz>, "Daryl Sears" <daryl.sears@defg.com>, "Szymon Walmsley" <szymon.walmsley@ijk.ac.uk>, "Frances Hurst" <frances.hurst@silly.org>, "Teegan Moses" <teegan.moses@silly.org>, "Klay Prentice" <klay.prentice@abc.ac.uk>, "Campbell Bowden" <campbell.bowden@pretty.edu>, "Karan Cuevas" <karan.cuevas@pretty.edu>, "Heidi Gardner" <heidi.gardner@somewhere.co.uk>, "Phoenix Mcclain" <phoenix.mcclain@abc.ac.uk>, "Bertha Connor" <bertha.connor@something.info>, "Kimberly Poole" <kimberly.poole@email.net>, "Jamil Munro" <jamil.munro@silly.org>, "Anton Britton" <anton.britton@silly.org>, "Tasha Harvey" <tasha.harvey@bba.com>, "Alexie Cantu" <alexie.cantu@ijk.ac.uk>, "Philip Chung" <philip.chung@pretty.edu>, "Kaisha Pearson" <kaisha.pearson@food.bz>, "Malika Huber" <malika.huber@bba.com>, "Jayce Aguilar" <jayce.aguilar@abc.ac.uk>
Content-Type: text/plain; charset="UTF-8"
Hi Pythonistas,
Welcome to our class! I hope you enjoy this task!
Best wishes,
The Boss
--
The Boss does not need a signature.
ASDA SUPERMARKETS LTD.
WWW.ADSA.COM
TELEPHONE 0123 456 7890
W MILK 4 PT 000002033217 £1.09D
CANNED FRUIT 002400000143 £1.00D
FRESH CREAM 505043902070 £1.85D
BLUEBERRIES 505085498023 £1.56D
GARLIC 000000001783 £0.78D
SALAD 505478123059 £0.60D
ICE LOLLIES 506002991123 £2.00V
FROZEN VEG 505085427534 £1.10D
BM MADELEINES 502978543211 £2.00D
TOTAL £11.98
CARD £11.98
TOTAL EPS £11.98
24/10/20 17:12:53
CHANGE DUE £0.00
PLEASE KEEP THIS COPY FOR YOUR RECORDS
No. ITEMS SOLD 9
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment