ASIC Data Analysis
In this notebook we load up the ASIC dataset and perform some basic analysis.
import requests
import pytz
from datetime import date, datetime
import csv
import io
import os
import requests
import json
import random
import pandas as pd
import re
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt
import shutil
import plotly.express as px
## Set download URL
url = 'https://data.gov.au/data/dataset/7b8656f9-606d-4337-af29-66b89b2eeefb/resource/cb7e4eb5-ed46-4c6c-97a0-4532f4479b7d/download/company_202104.zip'
## Setup function to stream file to disk
def download_file(url):
local_filename = url.split('/')[-1]
with requests.get(url, stream=True) as r:
with open(local_filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
return local_filename
downloaded_file = download_file(url)
import zipfile
with zipfile.ZipFile(downloaded_file,"r") as zip_ref:
zip_ref.extractall("./")
downloaded_file = 'company_202112.zip'
data = []
with open(downloaded_file.upper().split(".")[0] + ".csv",encoding='ISO-8859-1') as csvfile:
readCSV = csv.reader(csvfile, delimiter='\t')
for index, row in enumerate(readCSV):
if index < 1:
continue
dataRow = {}
dataRow["companyName"] = row[0].strip()
dataRow['AustralianCompanyNumber'] = row[1].strip()
dataRow['type'] = row[2].strip()
dataRow['class'] = row[3].strip()
dataRow['subClass'] = row[4].strip()
dataRow['status'] = row[5].strip()
dataRow['registrationDate'] = row[6].strip()
dataRow['previousRegistrationState'] = row[7].strip()
dataRow['stateRegistrationNumber'] = row[8].strip()
dataRow['modifiedSinceLastReportFlag'] = row[9].strip()
dataRow['currentNameFlag'] = row[10].strip()
dataRow['australianBusinessNumber'] = row[11].strip()
dataRow['currentName'] = row[12].strip()
dataRow['currentNameStartDate'] = row[13].strip()
data.append(dataRow)
# if index > 100:
# break
df = pd.DataFrame (data)
df['registrationDate'] = pd.to_datetime(df['registrationDate'])
df['companyNameClean'] = df['companyName'].map(lambda x: re.sub( r"[^a-zA-Z0-9]+", ' ', x))
type_descriptions = {'APTY' : 'Australian proprietary company',
'APUB' : 'Australian public company',
'FNOS' : 'Foreign company (a company incorporated outside Australia but registered as a foreign company in Australia)',
'NONC' : 'Non-organisation (a body not registered under Corporations Act',
'RACN' : 'Registered Australian Corporation (an organisation incorporated in Australia under a law other than the Corporations Law but required to be registered under the Corporations Law).'
}
class_descriptions = {
'LMSH' : 'Limited by Shares',
'LMGT' : 'Limited by Guarantee',
'LMSG' : 'Limited by Both Shares and Guarantees',
'NLIA' : 'No Liability Company',
'UNLM' : 'Unlimited Company',
'NONE' : 'Does not have an equivalent Australian liability',
}
subClass_descriptions = {
'EXPT' : 'Exempt Proprietary Company',
'HUNT' : 'Proprietary home unit company',
'LISN' : 'Company licensed under Section 383 of the Corporations Act 2001',
'LISS' : 'Company licensed under Section 383 to omit Limitedfrom its name',
'LIST' : 'Listed public company',
'NEXT' : 'Non-Exempt Proprietary Company',
'NLTD' : 'Non-profit public company registered without limited in its name under Section 150',
'NONE' : 'Unknown',
'PNPC' : 'Proprietary non-profit company',
'PROP' : 'Proprietary other',
'PSTC' : 'Proprietary superannuation trustee company',
'PUBF' : 'Foreign company required to lodge a balance sheet',
'RACA' : 'Registrable Australian corporation – association',
'RACO' : 'Registrable Australian corporation - non association',
'STFI' : 'Public company – small transferring financial institution',
'ULSN' : 'Unlisted public - non-profit company',
'ULSS' : 'Unlisted public - superannuation trustee company',
'ULST' : 'Unlisted pubic company'
}
status_descriptions = {'DRGD' : 'De-registered',
'EXAD' : 'External administration (in receivership/liquidation)',
'NOAC' : 'Not active',
'NRGD' : 'Not registered',
'PROV' : 'Provisional',
'REGD' : 'Registered',
'SOFF' : 'Strike-off action in progress',
'DISS' : 'Dissolved by Special Act of Parliament',
'DIV3' : 'Organisation Transferred Registration via DIV3',
'PEND' : 'Pending - Schemes'
}
df = df.replace({"type": type_descriptions, "class" : class_descriptions, 'subClass': subClass_descriptions, 'status': status_descriptions})
df.head(3)
original_df_row_count = len(df)
df = df[df['currentNameFlag'] == 'Y']
new_df_row_count = len(df)
print('Dataframe record count reduced from {} to {}, a reduction of {} records'.format(original_df_row_count, new_df_row_count, original_df_row_count - new_df_row_count))
df_recent= df.loc [
(df["registrationDate"] > '1980-01-01' )
,
['registrationDate', 'status']
]
fig = px.histogram(df_recent, x=df_recent["registrationDate"].dt.year, color=df_recent['status'], labels={'x' :'Registration Year'}, title='Businesses Registered per Year')
fig.for_each_trace(lambda t: t.update(name=t.name.replace("status=", "")))
fig.show()