Data scraping help the process of collecting data to conduct market research or get sales leads, optimize prices and take information and understanding of consumer needs about competitors’ products and services from websites, blogs, forums or marketplaces.
This notes is about the example of scraping in python to get product category data from https://shopee.co.id and save it in the Postgresql database.
Looking for APIs that provide product category data
-
Open Chrome and go to http://shopee.co.id
Click cursor on category and then press ctrl+shift+i to inspect page, as shown in the following image.
Click on the Network menu tab, sort the column type in the table results and look for rows of type fetch. Then find the API URL in the Name column that provides the category data in the preview.
After finding the required URL, click the Headers tab to view the URL address and copy it for further processing.
I use the postman application to provide an overview of the JSON data format generated from the URL.
.
Collecting data
-
prepare requirement installation
- install postgreSQL - install psycopg2 - install requests - install python-dotenv
-
Create an environment config class
This class will read and return environment (.env) file
from dotenv import load_dotenv import os from typing import get_type_hints, Union load_dotenv() class AppConfigError(Exception): pass def _parse_bool(val: Union[str, bool]) -> bool: # pylint: disable=E1136 return val if type(val) == bool else val.lower() in ['true', 'yes', '1'] class AppConfig: DEBUG: bool = False ENV: str = 'development' POSTGRES_DB: str POSTGRES_USER: str POSTGRES_PASSWORD: str POSTGRES_HOST: str POSTGRES_PORT: int LIMIT_RETRIES: int def __init__(self, env): for field in self.__annotations__: if not field.isupper(): continue # Raise AppConfig Error if required field not supplied default_value = getattr(self, field, None) if default_value is None and env.get(field) is None: raise AppConfigError('The {} field is required'.format(field)) # Cast env var value to expected type and raise AppConfigError on failure try: var_type = get_type_hints(AppConfig)[field] if var_type == bool: value = _parse_bool(env.get(field, default_value)) else: value = var_type(env.get(field, default_value)) self.__setattr__(field, value) except ValueError: raise AppConfigError('Unable to cast value of "{}" to type "{}" for "{}" field'.format( env[field], var_type, field ) ) def __repr__(self): return str(self.__dict__) # Expose Config object for app to import Config = AppConfig(os.environ)
-
Create Database connection class
This class used for database operation
import psycopg2 import time from db_environ import Config class Db_Connect(): def __init__(self, limit_retries, reconnect): self.param_dict = { "host" : Config.POSTGRES_HOST, "port" : Config.POSTGRES_PORT, "database" : Config.POSTGRES_DB, "user" : Config.POSTGRES_USER, "password" : Config.POSTGRES_PASSWORD } self._connection = None self._cursor = None self.reconnect = reconnect self.limit_retries = limit_retries self.ignite() def connect(self, retry_counter=0): if not self._connection: try: self._connection = psycopg2.connect(**self.param_dict) retry_counter = 0 self._connection.autocommit = False return self._connection except psycopg2.OperationalError as error: if not self.reconnect or retry_counter >= self. limit_retries: raise error else: retry_counter += 1 print("Got Error {}. reconnecting {}".format(str(error).strip(), retry_counter)) time.sleep(5) self.connect(retry_counter) except (Exception, psycopg2.Error) as error: raise error def pg_cursor(self): if not self._cursor or self._cursor.closed: if not self._connection: self.connect() self._cursor = self._connection.cursor() return self._cursor def execute(self, str_query, retry_counter=0): try: self._cursor.execute(str_query) self._connection.commit() retry_counter = 0 except (psycopg2.DatabaseError, psycopg2.OperationalError) as error: if retry_counter >= self.limit_retries: raise error else: retry_counter += 1 print("Got Error {}. reconnecting {}".format(str(error).strip(), retry_counter)) time.sleep(1) self.reset() self.execute(str_query, retry_counter) self._connection.commit() except (Exception, psycopg2.Error) as error: raise error def reset(self): self.close() self.connect() self.pg_cursor() def close(self): if self._connection: if self._cursor: self._cursor.close() self._connection.close() print('Database connection is closed') self._connection = None self._cursor = None def ignite(self): self.connect() self.pg_cursor()
-
Create class to collect category data
As we can see, in the general description of the JSON file, there are two types of data categories, namely main categories and subcategories. So I split the data collecting class for main and subcategories.
from os import environ from sys import displayhook import requests from db_connect import Db_Connect from string import Template class CollectMainCategory(): def __init__(self, urlapi): self.urlapi = urlapi self.db = Db_Connect(limit_retries=5, reconnect= True) self.cursor = self.db._cursor self.SaveToDatabase(self.Collect_Main_Category()) def Collect_Main_Category(self): url = self.urlapi resp = requests.get(url).json() category_source = resp["data"] list_values = list() for item in category_source: main_category = item.get("main") list_values += (main_category['display_name'], \ main_category['name'], \ main_category['catid'], \ main_category['parent_category'], \ str(main_category['is_adult']), \ str(main_category['block_buyer_platform']), \ main_category['sort_weight']), return list_values def SaveToDatabase(self, data): Template_SQL = ("INSERT INTO main_category(display_name,name,catid,parent_category,is_adult,block_buyer_platform, sort_weight) " "VALUES $list_value " "ON CONFLICT (catid) " "DO UPDATE SET display_name=EXCLUDED.display_name, " "name=EXCLUDED.name, parent_category=EXCLUDED.parent_category, " "is_adult=EXCLUDED.is_adult, block_buyer_platform=EXCLUDED.block_buyer_platform, " "sort_weight=EXCLUDED.sort_weight;" ) for rec in data: strSQL = Template(Template_SQL).substitute( list_value=rec ) self.db.execute(strSQL) if __name__ == "__main__": CollectMainCategory("https://shopee.co.id/api/v2/category_list/get_all")
from os import environ from sys import displayhook import requests from db_connect import Db_Connect from string import Template import re class CollectSubCategory(): def __init__(self, urlapi): self.urlapi = urlapi self.db = Db_Connect(limit_retries=5, reconnect= True) self.cursor = self.db._cursor self.SaveToDatabase(self.Collect_Sub_Category()) #self.Collect_Sub_Category() def Collect_Sub_Category(self): url = self.urlapi resp = requests.get(url).json() _source = resp["data"] list_rec = list() for dt in _source: sub_category = dt.get('sub') for rec in sub_category: sub = str(rec['sub_sub']) sub_formated = sub.replace("\'","\"") sub_formated = sub_formated.replace("None","\"None\"") list_rec += (self.TextSanitize(str(rec['display_name'])), \ self.TextSanitize(str(rec['name'])), \ rec['catid'], \ rec['parent_category'], \ str(rec['is_adult']), \ str(rec['block_buyer_platform']), \ rec['sort_weight'], sub_formated), return list_rec def TextSanitize(self, str): """Sanitizes a string so that it can be properly compiled in TeX. Escapes the most common TeX special characters: ~^_#%${} Removes backslashes. """ s = re.sub('\\\\', '', str) s = re.sub(r'([_^$%&#{}])', r'\\\1', str) s = re.sub(r'\'', r'\\~{}', str) return s def SaveToDatabase(self, data): Template_SQL = ("INSERT INTO sub_category(display_name,name,catid,parent_category,is_adult,block_buyer_platform, sort_weight, sub_sub) " "VALUES $list_recs " "ON CONFLICT (catid) " "DO UPDATE SET display_name=EXCLUDED.display_name, " "name=EXCLUDED.name, parent_category=EXCLUDED.parent_category, " "is_adult=EXCLUDED.is_adult, block_buyer_platform=EXCLUDED.block_buyer_platform, " "sort_weight=EXCLUDED.sort_weight, " "sub_sub=EXCLUDED.sub_sub;" ) for rec in data: strSQL = Template(Template_SQL).substitute( list_recs=rec ) self.db.execute(strSQL) if __name__ == "__main__": CollectSubCategory("https://shopee.co.id/api/v2/category_list/get_all")
-
Scraping Result
Here is the result of scraping product category from shopee.co.id. These results can be used for further processing later