# job-scraper/scrapers/base.py
#
# Repository viewer metadata: 77 lines, 2.2 KiB, Python.

from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
import httpx
@dataclass
class Job:
    """Represents a job listing.

    Identity is defined solely by ``external_id``: two ``Job`` objects with
    the same ``external_id`` compare equal and hash to the same value even
    when their other fields differ, so a set or dict keeps only one entry
    per ``external_id``.
    """

    external_id: str
    title: str
    url: str
    location: Optional[str] = None
    department: Optional[str] = None
    remote_type: Optional[str] = None  # 'remote', 'hybrid', 'onsite'

    # ``@dataclass`` does not replace ``__eq__``/``__hash__`` that are defined
    # explicitly in the class body, so the two methods below take effect.

    def __hash__(self) -> int:
        """Hash on ``external_id`` only, consistent with ``__eq__``."""
        return hash(self.external_id)

    def __eq__(self, other: object) -> bool:
        """Return True when *other* is a ``Job`` with the same ``external_id``.

        For any other type ``NotImplemented`` is returned so that Python can
        try the reflected comparison; ``job == "text"`` still evaluates to
        ``False`` through the interpreter's default fallback.
        """
        if not isinstance(other, Job):
            return NotImplemented
        return self.external_id == other.external_id
class BaseScraper(ABC):
    """Base class for all job scrapers.

    Each instance owns an ``httpx.Client`` configured with the given timeout,
    a fixed ``User-Agent`` header and redirect following. Instances are
    context managers: leaving the ``with`` block closes the client.

    Subclasses must implement :meth:`scrape`.
    """

    def __init__(self, company_name: str, jobs_url: str, timeout: int = 30):
        """Store the scraper settings and create the HTTP client.

        Args:
            company_name: Company name, stored on the instance.
            jobs_url: Default URL used by ``fetch``/``fetch_json`` when the
                caller does not pass one.
            timeout: Timeout value handed to ``httpx.Client``.
        """
        self.company_name = company_name
        self.jobs_url = jobs_url
        self.timeout = timeout
        self.client = httpx.Client(
            timeout=timeout,
            headers={
                "User-Agent": "JobScraper/1.0 (Personal job search tool)"
            },
            follow_redirects=True,
        )

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the HTTP client; exceptions from the block are not suppressed."""
        self.client.close()

    def _get_response(self, url: Optional[str] = None) -> "httpx.Response":
        """GET *url* (or ``jobs_url`` when *url* is falsy) and return the response.

        Raises whatever ``raise_for_status()`` raises for an error status.
        """
        response = self.client.get(url or self.jobs_url)
        response.raise_for_status()
        return response

    def fetch(self, url: Optional[str] = None) -> str:
        """Fetch the content from a URL and return the response text.

        Args:
            url: URL to request; falls back to ``self.jobs_url`` when omitted
                or empty.
        """
        return self._get_response(url).text

    def fetch_json(self, url: Optional[str] = None) -> dict:
        """Fetch a URL and return its decoded JSON body.

        Args:
            url: URL to request; falls back to ``self.jobs_url`` when omitted
                or empty.

        Note:
            The return annotation is ``dict``, but the value is whatever the
            response's ``json()`` yields; a document whose top level is a
            JSON array decodes to a list.
        """
        return self._get_response(url).json()

    @abstractmethod
    def scrape(self) -> list[Job]:
        """Scrape jobs from the company's career page. Must be implemented by subclasses."""

    def classify_remote(self, location: Optional[str]) -> Optional[str]:
        """Try to classify if a job is remote based on location text.

        Args:
            location: Free-form location text; may be ``None`` or empty.

        Returns:
            ``None`` when *location* is empty or ``None``; otherwise
            ``"hybrid"`` if the text mentions "hybrid", ``"remote"`` if it
            mentions "remote", and ``"onsite"`` in every other case.
            Matching is case-insensitive.
        """
        if not location:
            return None
        location_lower = location.lower()
        # "hybrid" is tested first so it wins both when it appears next to
        # "remote" and when it appears alone (e.g. "Hybrid - New York"),
        # which would otherwise be reported as "onsite".
        if "hybrid" in location_lower:
            return "hybrid"
        if "remote" in location_lower:
            return "remote"
        return "onsite"