MWCバルセロナ2019に行くことになったので出展社を調査しようと思ったのですが、サイト上で1社ずつ調べるのは面倒だったので、出展社一覧をCSVファイルに出力するスクリプトを作成しました。
mwc_Exhibitors.py
import csv

import requests
# Listing pages are addressed as <base><page number>/.
_LISTING_URL = 'https://www.mwcbarcelona.com/exhibition/2019-exhibitors/page/'
# Marker text that ends pagination when it appears in a listing page.
_END_MARKER = "No exhibitor found."
_REQUEST_HEADERS = {'Content-Type': 'text/html; charset=UTF-8'}
_REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the run
_CSV_HEADER = "Company,website,description,tags,location\n"


def _fetch_html(url):
    """Download *url* with ``requests`` and return the body as text."""
    response = requests.get(url, headers=_REQUEST_HEADERS,
                            timeout=_REQUEST_TIMEOUT)
    # NOTE(review): assumes the site serves UTF-8 (the original script wrote
    # the raw bytes straight to the CSV) -- confirm against the live site.
    return response.content.decode('utf-8', 'replace')


def _parse_listing_page(html):
    """Return ``(company_name, company_url)`` pairs found in a listing page."""
    entries = []
    company_url = None
    for line in html.split("\n"):
        if 'data-type="exhibitor"' in line:
            # First double-quoted value on the line is taken as the URL.
            company_url = line.split('"')[1]
        elif 'class="box-title"' in line:
            pieces = line.split('>')
            # Skip malformed titles and titles seen before any exhibitor
            # link instead of crashing or pairing them with a stale URL.
            if len(pieces) < 2 or company_url is None:
                continue
            entries.append((pieces[1].split("<")[0], company_url))
    return entries


def _collect_exhibitors(fetch):
    """Walk the listing pages from page 0 and return every exhibitor pair."""
    exhibitors = []
    page = 0
    while True:
        html = fetch(_LISTING_URL + str(page) + '/')
        if _END_MARKER in html:
            break
        entries = _parse_listing_page(html)
        # Guard against an endless loop when the end marker never shows up
        # (e.g. the site layout changed). Page 0 is exempt because the
        # original numbering starts there and it may not be a real listing
        # page -- TODO confirm.
        if not entries and page > 0:
            break
        exhibitors.extend(entries)
        page += 1
    return exhibitors


def _first_paragraph(fragment):
    """Return the text of the first ``<p>`` in *fragment*, markup removed.

    Falls back to the whole fragment when it contains no ``<p>`` (the
    original indexed ``split("<p>")[1]`` unconditionally and crashed).
    """
    pieces = fragment.split("<p>")
    body = pieces[1] if len(pieces) > 1 else fragment
    return body.replace("<br />", "").replace("</p>", "")


def _parse_company_page(html):
    """Extract ``(website, description, location, tags)`` from a company page.

    Every field defaults to the string ``'none'`` when its marker is absent.
    """
    location = 'none'
    description = 'none'
    website = 'none'
    tags = 'none'
    in_description = False
    description_lines = []
    for line in html.split("\n"):
        if 'class="list-location">' in line:
            location = line.split('class="list-location">')[1].split("<")[0]
        elif 'class="mod-content api-description"' in line:
            in_description = True
            description_lines = []
            description = ''
        elif in_description:
            # Accumulate raw lines until the closing </div> of the block.
            if '</div>' in line:
                description = _first_paragraph("".join(description_lines))
                in_description = False
            else:
                description_lines.append(line)
        elif 'class="web-site-link"' in line:
            website = line.split('"')[1]
        elif 'name="tags"' in line:
            # Each tag label is the text just before its closing </label>;
            # labels are emitted comma-terminated, as in the original output.
            tags = "".join(
                chunk.split('</label')[0].split('>')[-1] + ','
                for chunk in line.split('name="tags"')
                if '</span>' in chunk
            )
    if in_description:
        # Description block never closed: keep the raw accumulated text.
        description = "".join(description_lines)
    return website, description, location, tags


def _write_csv(path, rows):
    """Write the header and fully quoted *rows* to *path* as UTF-8 CSV."""
    # newline="" lets the csv module control line endings; the context
    # manager guarantees the file is flushed and closed.
    with open(path, "w", encoding="utf-8", newline="") as fp_csv:
        fp_csv.write(_CSV_HEADER)
        writer = csv.writer(fp_csv, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerows(rows)


def main(fetch=None, output_path="mwc_exhbitor_list.csv"):
    """Scrape the MWC Barcelona 2019 exhibitor list into a CSV file.

    Collects every exhibitor name and detail-page URL from the paginated
    listing, visits each detail page for website, description, location and
    tags, and writes one CSV row per company. Requires Python 3.

    Args:
        fetch: Optional callable ``fetch(url) -> str`` returning a page's
            HTML. Defaults to an HTTP GET via ``requests``.
        output_path: Destination CSV path; defaults to the file name the
            original script used.

    Returns:
        None.
    """
    if fetch is None:
        fetch = _fetch_html

    # Collect data
    print('start get company data')
    exhibitors = _collect_exhibitors(fetch)

    rows = []
    total = len(exhibitors)
    for position, (company_name, company_url) in enumerate(exhibitors, 1):
        print(str(position) + "/" + str(total))
        website, description, location, tags = _parse_company_page(
            fetch(company_url))
        rows.append([company_name, website, description, tags, location])

    _write_csv(output_path, rows)
    return
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()