Data Scraping with Python (Splash-bs4)
2 min readOct 20, 2021
Hello for everyone!
Welcome to my medium page. Today ı will tell you how ı create a scraping code by BeautifulSoup and Splash. But you need to use some tools for scraping.
- Docker
- Splash (Open source)
- Python
from bs4 import BeautifulSoup
import requests
import csv
import datetime
url = "www.web_site.com"
page = requests.get(url)
def extract(url):
"""
Export all cryptodata from web_site.com
website
Arguments:
url (str):
url of the aimed Web_Site page
Return:
.csv file
"""
r = requests.session()
start = datetime.datetime.now()
for retry in range(10):
response = r.get(url=url)
print(response.headers)
print("-- STATUS CODE --")
print(response.status_code)
if response.status_code == 200:
with open("C:\\Users\\Username\\path\\WebScrapingCode//cryptocurrencies1_{}.csv".format(str(datetime.date.today())), "w") as f:
fieldnames = ['name', 'price', 'coin_url', '24h_%', '7d_%', 'Market_Cap', 'Volume(24h)', 'Volume(2)', 'Circulating_Supply']
writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
writer.writeheader()
soup = BeautifulSoup(page.content, 'html.parser')
cryptos = soup.find_all('table', class_="h7vnx2-2 czTsgW cmc-table")
for crypto in cryptos:
name = cryptos.find('p', class_="sc-1eb5slv-0 iworPT").extract_first()
price = cryptos.find('a', class_="cmc-link").extract_first()
coin_url = cryptos.find('a', href_="cmc-link").extract_first()
percentage_24h = cryptos.find('span', class_="sc-15yy2pl-0 hzgCfk").extract_first()
percentage_7d = cryptos.find('span', class_="sc-15yy2pl-0 kAXKAX").extract_first()
Market_Cap = cryptos.find('span', class_="sc-1ow4cwt-1 ieFnWP").extract_first()
Volume_24h = cryptos.find('p', class_="sc-1eb5slv-0 hykWbK font_weight_500").extract_first()
Volume_2 = cryptos.find('p', class_="sc-1eb5slv-0 etpvrL").extract_first()
Circulating_Supply = cryptos.find('p', class_="sc-1eb5slv-0 kZlTnE").extract_first()
clean_values = []
values = [name, price, coin_url, percentage_24h, percentage_7d, Market_Cap, Volume_24h, Volume_2, Circulating_Supply]
for value in values:
if value:
value = value.strip().replace('\n', '')
clean_values.append(value)
print(', '.join(clean_values))
dict_row = dict(zip(fieldnames, clean_values))
writer.writerow(dict_row)
else:
print("Page indisponible")
def main():
url = "www.web_site.com"
extract(url)
if __name__ == '__main__':
main()
NOTE; This code sample is for guidance purposes only. The legal responsibility for the use of the code belongs to the individuals. No liability is accepted by me.