The basic steps involved in web scraping are:
1) Loading the document (HTML content)
2) Parsing the document
3) Extraction
4) Transformation
- Import necessary libraries (requests, BeautifulSoup, re, matplotlib.pyplot).
- Define the convert_price_to_float(price) function: remove non-numeric characters from a price string and convert it to a float.
- Define the get_amazon_products(search_query) function: scrape Amazon for product information based on the search query.
- Fetch and parse the HTML content, extract product names and prices from the search results, and sort the product information by converted price in ascending order.
- Return sorted product data as a list of dictionaries.
- Call get_amazon_products(search_query) to get product data based on the user's search query.
- Check if products are found; if not, display "No products found."
- Visualize Product Data using a Bar Chart
import requests
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
def convert_price_to_float(price):
    """Convert a displayed price such as 'Rs. 1,299.00' to a float.

    The first number found in the text is used and thousands separators
    (commas) are dropped. Picking out a well-formed number, instead of
    deleting every non-digit character, keeps stray dots (as in 'Rs. 500')
    from turning into a decimal point and avoids a ValueError on inputs
    such as '.' or '1.2.3'.

    Args:
        price: Price text. ``None`` is treated like an empty string; other
            non-string values are converted with ``str()`` first.

    Returns:
        The parsed price as a float, or 0.0 when no number is present.
    """
    if price is None:
        return 0.0
    # Either digits (with optional commas) and an optional fractional part,
    # or a bare fractional part such as '.99'.
    match = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', str(price))
    if match is None:
        return 0.0
    return float(match.group().replace(',', ''))
def get_amazon_products(search_query):
    """Scrape the Amazon India search page for products matching a query.

    Args:
        search_query: Free-text search term entered by the user.

    Returns:
        A list of ``{'Product': <name>, 'Price': <price text>}`` dictionaries
        sorted by price in ascending order. The list is empty when the
        request fails, the response status is not 200, or no product could
        be parsed from the page.
    """
    base_url = 'https://www.amazon.in'
    headers = {
        'User-Agent': 'Your User Agent'  # Add your User Agent here
    }
    products_data = []  # List to store product information

    try:
        # Passing the query via ``params`` lets requests URL-encode it
        # (spaces become '+', and characters such as '&' are escaped).
        # The timeout keeps the script from hanging on a stalled connection.
        response = requests.get(
            f'{base_url}/s',
            params={'k': search_query},
            headers=headers,
            timeout=10,
        )
    except requests.RequestException as exc:
        print(f'Request to Amazon failed: {exc}')
        return products_data

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # NOTE(review): these selectors assume Amazon's usual search-result
        # markup (result cards tagged data-component-type="s-search-result",
        # title in <h2>, price in span.a-price) -- confirm against the live
        # page, since the markup changes over time.
        results = soup.find_all(
            'div', attrs={'data-component-type': 's-search-result'}
        )
        for item in results:
            name_tag = item.find('h2')
            price_tag = (
                item.select_one('span.a-price span.a-offscreen')
                or item.find('span', class_='a-price-whole')
            )
            # Skip cards (e.g. ads or unavailable items) missing either field.
            if name_tag is None or price_tag is None:
                continue
            name = name_tag.get_text(strip=True)
            price = price_tag.get_text(strip=True)
            if name and price:
                products_data.append({'Product': name, 'Price': price})

    return sorted(products_data, key=lambda x: convert_price_to_float(x['Price']))
# Ask the user what to look for and fetch the matching products.
search_query = input('Enter product to search on Amazon: ')
products = get_amazon_products(search_query)

if not products:
    print('No products found.')
else:
    # Labels are capped at 30 characters so they fit beside the bars.
    labels = [item['Product'][:30] for item in products]
    prices = [convert_price_to_float(item['Price']) for item in products]
    positions = range(len(prices))

    # Horizontal bar chart: one bar per product, in the order returned.
    plt.figure(figsize=(10, 6))
    plt.barh(positions, prices, color='skyblue')
    plt.xlabel('Price')
    plt.ylabel('Product')
    plt.title(f'Products and their Prices on Amazon for {search_query.capitalize()} (Ascending Order)')
    plt.yticks(positions, labels)  # Use the shortened names as y-axis labels
    plt.tight_layout()
    plt.show()