import pandas as pd
import numpy as np
CompustatData = pd.read_csv('./data/compustat_data.csv')
CompustatData=CompustatData[['pi','txt','txpd','gvkey','fyear','naics','fic']]
CompustatData=CompustatData.dropna()
CompustatData.columns
years=[2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016]
CompustatData=CompustatData[(CompustatData.fyear>1995) & (CompustatData.fyear<2019)]
CompustatData.fyear=CompustatData.fyear.astype(int)
CompustatData = CompustatData.drop_duplicates(subset = ['gvkey', 'fyear']) #this is not ok.
#CompustatData = CompustatData.sort_values(['gvkey', 'fyear'])
#CompustatData = CompustatData.set_index(['gvkey', 'fyear'], verify_integrity = True, drop = False)
CompustatData = CompustatData[CompustatData['fic'] == 'USA']
CompustatData=CompustatData[CompustatData['fyear'].isin( years)]
naics_labels=['Agriculture, Forestry, Fishing and Hunting',\
'Mining, Quarrying, and Oil and Gas Extraction',\
'Utilities',\
'Construction',\
'NA1',\
'Manufacturing',\
'NA2',\
'Wholesale Trade',\
'Retail Trade',\
'NA3',\
'Transportation and Warehousing',\
'NA4',\
'Information',\
'Finance and Insurance',\
'Real Estate and Rental and Leasing',\
'Professional, Scientific, and Technical Services',\
'Management of Companies and Enterprises',\
'Administrative and Support and Waste Management and Remediation Services',\
'Educational Services',\
'Health Care and Social Assistance',\
'Arts, Entertainment, and Recreation',\
'Accommodation and Food Service',\
'Other Services','Public administration']
naics_code=[11,21,22,23,24,31,35,42,44,46,48,50,51,52,53,54,55,56,61,62,71,72,81,83,92]
n=24
#naics_labels=['Ag. Mining and Utilities',\
# 'Consts and Mfg',\
# 'Trade and warehousing',\
# 'Information and FIRE',\
# 'RE',\
# 'Professional services',\
# 'Nonprof services']
#naics_code=[11,23,42,51,53,54,57,92]
CompustatData['naics_label']=pd.cut(CompustatData['naics']/10**4,naics_code[:n+1],labels=naics_labels[:n],include_lowest=True,right=False)
CompustatData['naics_code_cat']=pd.cut(CompustatData['naics']/10**4,naics_code[:n+1],include_lowest=True,right=False)
CompustatData['naics_2d']=pd.cut(CompustatData['naics']/10**4,naics_code[:n+1],labels=naics_code[:n],include_lowest=True,right=False)
temp=pd.pivot_table(CompustatData, values=['txt','pi'], columns=['fyear'], aggfunc=np.mean).T
temp2=pd.DataFrame((temp.txt/temp.pi).values)
temp2=temp2.T
temp2.columns=years
temp2.index=['all']
naics_year_count = pd.pivot_table(CompustatData, values='gvkey', index=['naics_label'], columns=['fyear'], aggfunc=len)
naics_year_pi = pd.pivot_table(CompustatData, values='pi', index=['naics_label'], columns=['fyear'], aggfunc=np.sum)
naics_year_txt =pd.pivot_table(CompustatData, values='txt', index=['naics_label'], columns=['fyear'], aggfunc=np.sum)
naics_year_txpd =pd.pivot_table(CompustatData, values='txpd', index=['naics_label'], columns=['fyear'], aggfunc=np.sum)
naics_year_etr_txt=naics_year_txt/naics_year_pi
naics_year_etr_txpd=naics_year_txpd/naics_year_pi
naics_year_etr_txt=pd.concat([temp2,naics_year_etr_txt])
temp=pd.pivot_table(CompustatData, values=['txpd','pi'], columns=['fyear'], aggfunc=np.mean).T
temp2=pd.DataFrame((temp.txpd/temp.pi).values)
temp2=temp2.T
temp2.columns=years
temp2.index=['all']
naics_year_etr_txpd = pd.concat([temp2,naics_year_etr_txpd])
naics_year_count
naics_year_etr_txt.T.describe().T
naics_year_etr_txt
naics_year_etr_txpd
naics_year_etr_txpd.T.describe().T