Python XML Stream Processing
Simple Example
Using etree
we can process an XML file as a stream of events. This is demonstrated on a string below; however, it would normally make sense to process a large file this way.
from lxml import etree
from io import BytesIO
# A tiny in-memory document standing in for what would normally be a large file.
xmlData = '''<?xml version="1.0" encoding="UTF-8"?>
<submission>
<transactionReport type="Domestic Transfer" direction="Incoming">
<amount>8.32</amount>
</transactionReport>
</submission>'''
# Collapse every run of whitespace (newlines included) into a single space so
# the whitespace-only text between elements prints as ' '.
xmlData = ' '.join(xmlData.split())

# iterparse expects a file-like object, so wrap the encoded bytes in a buffer.
stream = BytesIO(xmlData.encode("UTF-8"))

# Ask for both event kinds: one 'start' when a tag opens, one 'end' when it closes.
# NOTE: lxml only guarantees an element is fully parsed at its 'end' event;
# text seen on 'start' may be incomplete for larger inputs.
for event, element in etree.iterparse(stream, events=('end', 'start')):
    summary = {
        'event': event,
        "element.text": element.text,
        "element.tag": element.tag,
        "element.attribute": element.attrib,
    }
    print(summary)
The output of the above code is the following. You can see it's a stream of events.
{'event': 'start', 'element.text': ' ', 'element.tag': 'submission', 'element.attribute': {}}
{'event': 'start', 'element.text': ' ', 'element.tag': 'transactionReport', 'element.attribute': {'type': 'Domestic Transfer', 'direction': 'Incoming'}}
{'event': 'start', 'element.text': '8.32', 'element.tag': 'amount', 'element.attribute': {}}
{'event': 'end', 'element.text': '8.32', 'element.tag': 'amount', 'element.attribute': {}}
{'event': 'end', 'element.text': ' ', 'element.tag': 'transactionReport', 'element.attribute': {'type': 'Domestic Transfer', 'direction': 'Incoming'}}
{'event': 'end', 'element.text': ' ', 'element.tag': 'submission', 'element.attribute': {}}
Complex Example
The example below is a slightly more complex version of the one above, set in the financial intelligence domain. In it we build up a list of the information we want to extract, in the format we need. Based on the length of the list (or the number of iterations) you can then act on the list (for example, process it in batches).
Input XML File
# Sample input for the complex example: one <submission> holding three
# <transactionReport> elements. Each report carries an amount, a
# beneficiaryCustomer, an orderingCustomer, a reference, a reportType and a
# transactionDatetime. In this sample <institutionCode> and <jobTitle> appear
# only on some of the customers.
xmlData = """<?xml version="1.0" encoding="UTF-8"?>
<submission>
<transactionReport>
<amount>8.32</amount>
<beneficiaryCustomer>
<account>
<branchId>MCGPUS51</branchId>
<network>swift</network>
<number>840424388</number>
</account>
<address>
<fullAddress>C/O Elian Fiduciary Services 89 Nexus Way Camana Bay AU KY1-9007 KY</fullAddress>
</address>
<identification>
<identificationType>lei</identificationType>
<identifier>549300VVQZ04QTONQ475</identifier>
</identification>
<name>
<fullName>HUNTER PEAK INVESTMENTS MASTER FUND, LP</fullName>
</name>
</beneficiaryCustomer>
<orderingCustomer>
<account>
<branchId>632-001</branchId>
<institutionCode>BAE</institutionCode>
<network>swift</network>
<number>346648681</number>
</account>
<address>
<fullAddress>136 St John Street Launceston TAS 7250 AU</fullAddress>
</address>
<identification>
<identificationType>birthDate</identificationType>
<identifier>1972-04-05</identifier>
</identification>
<jobTitle>Air traffic controller</jobTitle>
<name>
<fullName>OPAL FISHNICK</fullName>
</name>
</orderingCustomer>
<reference>6996831613084</reference>
<reportType>internationalFundsTransferInstruction</reportType>
<transactionDatetime>2020-12-23 03:27:19+00:00</transactionDatetime>
</transactionReport>
<transactionReport>
<amount>1317.6</amount>
<beneficiaryCustomer>
<account>
<branchId>632-000</branchId>
<institutionCode>BAE</institutionCode>
<network>swift</network>
<number>088919318</number>
</account>
<address>
<fullAddress>105A York Street Launceston TAS 7250 AU</fullAddress>
</address>
<identification>
<identificationType>australianBusinessNumber</identificationType>
<identifier>141435574</identifier>
</identification>
<name>
<fullName>EFFECTAL MEDIA PTY LTD</fullName>
</name>
</beneficiaryCustomer>
<orderingCustomer>
<account>
<branchId>PASTUS61</branchId>
<network>swift</network>
<number>964607475</number>
</account>
<address>
<fullAddress>C/O Sameer Upadhya 1850 M Street North West Suite 400 Washington US-DC 20036 US</fullAddress>
</address>
<identification>
<identificationType>lei</identificationType>
<identifier>549300JTIWM0TTCM0S45</identifier>
</identification>
<name>
<fullName>Park 7 Residential Limited Partnership</fullName>
</name>
</orderingCustomer>
<reference>3794265963049</reference>
<reportType>internationalFundsTransferInstruction</reportType>
<transactionDatetime>2020-12-21 04:58:34+00:00</transactionDatetime>
</transactionReport>
<transactionReport>
<amount>15.36</amount>
<beneficiaryCustomer>
<account>
<branchId>632-000</branchId>
<institutionCode>BAE</institutionCode>
<network>swift</network>
<number>002299101</number>
</account>
<address>
<fullAddress>10 Coulter Court Launceston TAS 7250 AU</fullAddress>
</address>
<identification>
<identificationType>benefitsCard</identificationType>
<identifier>66305454411</identifier>
</identification>
<jobTitle>Retired</jobTitle>
<name>
<fullName>MERCY BOTTARO</fullName>
</name>
</beneficiaryCustomer>
<orderingCustomer>
<account>
<branchId>BAPPIT21V89</branchId>
<network>swift</network>
<number>629637117</number>
</account>
<address>
<fullAddress> Soresina IT</fullAddress>
</address>
<identification>
<identificationType>birthDate</identificationType>
<identifier>1976-12-31</identifier>
</identification>
<name>
<fullName>Mr B STAMY</fullName>
</name>
</orderingCustomer>
<reference>9737726812342</reference>
<reportType>internationalFundsTransferInstruction</reportType>
<transactionDatetime>2020-12-21 01:08:54+00:00</transactionDatetime>
</transactionReport>
</submission>"""
Just like in the first example, we iterate over all the events within the XML file and build a structure around them.
from lxml import etree
from io import BytesIO
from rich.pretty import pprint
# Walk the submission as a stream of start/end events and assemble one
# dictionary per <transactionReport>:
#   {"role": [<role dict>, ...], "amount": float, "reference": str,
#    "reportType": str, "transactionDatetime": str}
# Each role dict (one per beneficiaryCustomer / orderingCustomer) holds lists
# of name, address, account and identification dictionaries, plus an optional
# "jobTitle" string.
reports = []
for event, element in etree.iterparse(
        BytesIO(xmlData.encode("UTF-8")),
        events=('end', 'start')):
    tag = element.tag

    if event == 'start':
        # A 'start' event only opens a fresh container. Values are read on
        # 'end', when the element is guaranteed to be fully parsed.
        if tag == 'transactionReport':
            report = {"role": []}
        elif tag in ('beneficiaryCustomer', 'orderingCustomer'):
            roleDetails = {
                "role": tag,
                'name': [],
                'address': [],
                'account': [],
                'identification': [],
            }
        elif tag == 'name':
            name = {}
        elif tag == 'address':
            address = {}
        elif tag == 'account':
            account = {}
        elif tag == 'identification':
            identification = {}
        continue

    # Only 'start' and 'end' were requested, so from here on event == 'end'.

    ## Report-level fields
    if tag == 'amount':
        report['amount'] = float(element.text)
    elif tag in ('reference', 'reportType', 'transactionDatetime'):
        report[tag] = element.text

    ## Name
    elif tag == 'fullName':
        name['fullName'] = element.text
        roleDetails['name'].append(name)

    ## Address
    elif tag == 'fullAddress':
        address['fullAddress'] = element.text
        roleDetails['address'].append(address)

    ## Account
    elif tag in ('branchId', 'network', 'number'):
        account[tag] = element.text
    elif tag == 'account':
        roleDetails['account'].append(account)

    ## Identification
    elif tag in ('identificationType', 'identifier'):
        identification[tag] = element.text
    elif tag == 'identification':
        # Attach the finished identification to the role once its closing tag
        # is seen; without this step the list would always stay empty.
        roleDetails['identification'].append(identification)

    ## Job Title
    elif tag == 'jobTitle':
        roleDetails['jobTitle'] = element.text

    ## End Role
    elif tag in ('beneficiaryCustomer', 'orderingCustomer'):
        report["role"].append(roleDetails)

    ## End Report
    elif tag == 'transactionReport':
        reports.append(report)
        # Everything needed has been copied into plain Python objects, so drop
        # the parsed subtree; otherwise memory grows with the size of the file.
        element.clear()

pprint(reports[0])
The dictionary generated by the above code looks like this.
{
"role": [
{
"role": "beneficiaryCustomer",
"name": [{"fullName": "HUNTER PEAK INVESTMENTS MASTER FUND, LP"}],
"address": [{"fullAddress": "C/O Elian Fiduciary Services 89 Nexus Way Camana Bay AU KY1-9007 KY"}],
"account": [{"branchId": "MCGPUS51", "network": "swift", "number": "840424388"}],
"identification": []
},
{
"role": "orderingCustomer",
"name": [{"fullName": "OPAL FISHNICK"}],
"address": [{"fullAddress": "136 St John Street Launceston TAS 7250 AU"}],
"account": [{"branchId": "632-001", "network": "swift", "number": "346648681"}],
"identification": [],
"jobTitle": "Air traffic controller"
}
],
"amount": 8.32,
"reference": "6996831613084",
"reportType": "internationalFundsTransferInstruction",
"transactionDatetime": "2020-12-23 03:27:19+00:00"
}