from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

import httpx


@dataclass
class Job:
    """Represents a job listing."""

    external_id: str
    title: str
    url: str
    location: Optional[str] = None
    department: Optional[str] = None
    remote_type: Optional[str] = None  # 'remote', 'hybrid', 'onsite'

    def __hash__(self):
        return hash(self.external_id)

    def __eq__(self, other):
        if isinstance(other, Job):
            return self.external_id == other.external_id
        return False


class BaseScraper(ABC):
    """Base class for all job scrapers."""

    def __init__(self, company_name: str, jobs_url: str, timeout: int = 30):
        self.company_name = company_name
        self.jobs_url = jobs_url
        self.timeout = timeout
        self.client = httpx.Client(
            timeout=timeout,
            headers={
                "User-Agent": "JobScraper/1.0 (Personal job search tool)"
            },
            follow_redirects=True,
        )

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.client.close()

    def fetch(self, url: Optional[str] = None) -> str:
        """Fetch the content from a URL (defaults to the configured jobs URL)."""
        target_url = url or self.jobs_url
        response = self.client.get(target_url)
        response.raise_for_status()
        return response.text

    def fetch_json(self, url: Optional[str] = None) -> dict:
        """Fetch and parse JSON from a URL (defaults to the configured jobs URL)."""
        target_url = url or self.jobs_url
        response = self.client.get(target_url)
        response.raise_for_status()
        return response.json()

    @abstractmethod
    def scrape(self) -> list[Job]:
        """Scrape jobs from the company's career page.

        Must be implemented by subclasses.
        """

    def classify_remote(self, location: Optional[str]) -> Optional[str]:
        """Try to classify whether a job is remote based on its location text."""
        if not location:
            return None
        location_lower = location.lower()
        if "remote" in location_lower:
            if "hybrid" in location_lower:
                return "hybrid"
            return "remote"
        return "onsite"
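

# --- Example usage (illustrative sketch, not part of the base module) ---
# A minimal subclass showing one way scrape() might be implemented against a
# hypothetical JSON careers endpoint. The company name, URL, and response
# fields ("jobs", "id", "title", "absolute_url", "location") are assumptions
# for illustration only; adapt them to the API you are actually scraping.
class ExampleJsonScraper(BaseScraper):
    def scrape(self) -> list[Job]:
        data = self.fetch_json()  # GET self.jobs_url and parse the JSON body
        jobs: list[Job] = []
        for item in data.get("jobs", []):
            location = item.get("location")
            jobs.append(
                Job(
                    external_id=str(item["id"]),
                    title=item["title"],
                    url=item["absolute_url"],
                    location=location,
                    remote_type=self.classify_remote(location),
                )
            )
        return jobs


if __name__ == "__main__":
    # Using the scraper as a context manager ensures the httpx.Client is closed.
    with ExampleJsonScraper("Example Co", "https://example.com/api/jobs") as scraper:
        for job in scraper.scrape():
            print(job.title, job.url, job.remote_type)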