# job-scraper/scrapers/base.py

from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
import httpx


@dataclass
class Job:
    """Represents a job listing.

    Identity is defined solely by ``external_id``: two ``Job`` objects with
    the same ``external_id`` compare equal and hash alike even when their
    other fields differ, so sets and dict keys de-duplicate on that ID.
    Because ``__eq__`` and ``__hash__`` are defined explicitly here,
    ``@dataclass`` does not replace them with field-by-field versions.
    """

    external_id: str  # the only field used for equality and hashing
    title: str
    url: str
    location: Optional[str] = None
    department: Optional[str] = None
    remote_type: Optional[str] = None  # 'remote', 'hybrid', 'onsite'

    def __hash__(self) -> int:
        """Hash on ``external_id`` only, keeping it consistent with ``__eq__``."""
        return hash(self.external_id)

    def __eq__(self, other: object) -> bool:
        """Return True when *other* is a ``Job`` with the same ``external_id``.

        For operands that are not ``Job`` instances, ``NotImplemented`` is
        returned so Python can try the reflected comparison; if that also
        declines, ``==`` still evaluates to ``False``.
        """
        if not isinstance(other, Job):
            return NotImplemented
        return self.external_id == other.external_id


class BaseScraper(ABC):
    """Base class for all job scrapers.

    Owns a shared ``httpx.Client`` configured with a timeout, a fixed
    ``User-Agent`` header and redirect following. Instances can be used as
    context managers; leaving the ``with`` block closes the client.
    Subclasses must implement :meth:`scrape`.
    """

    def __init__(self, company_name: str, jobs_url: str, timeout: int = 30):
        """Store the scraper settings and create the HTTP client.

        Args:
            company_name: Name of the company this scraper targets.
            jobs_url: Default URL used by ``fetch``/``fetch_json`` when no
                explicit URL is passed.
            timeout: Timeout handed to ``httpx.Client`` (seconds).
        """
        self.company_name = company_name
        self.jobs_url = jobs_url
        self.timeout = timeout
        self.client = httpx.Client(
            timeout=timeout,
            headers={
                "User-Agent": "JobScraper/1.0 (Personal job search tool)"
            },
            follow_redirects=True
        )

    def __enter__(self) -> "BaseScraper":
        """Return the scraper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Close the HTTP client; exceptions from the block are not suppressed."""
        self.client.close()

    def fetch(self, url: Optional[str] = None) -> str:
        """Fetch a URL and return the response body as text.

        Args:
            url: URL to request; falls back to ``self.jobs_url`` when
                omitted or empty.

        Raises:
            httpx.HTTPStatusError: If the response has a 4xx/5xx status.
        """
        target_url = url or self.jobs_url
        response = self.client.get(target_url)
        response.raise_for_status()
        return response.text

    def fetch_json(self, url: Optional[str] = None) -> dict:
        """Fetch a URL and return the decoded JSON body.

        The payload is returned exactly as decoded, so an endpoint whose
        top-level JSON value is an array yields a list despite the ``dict``
        annotation.

        Args:
            url: URL to request; falls back to ``self.jobs_url`` when
                omitted or empty.

        Raises:
            httpx.HTTPStatusError: If the response has a 4xx/5xx status.
        """
        target_url = url or self.jobs_url
        response = self.client.get(target_url)
        response.raise_for_status()
        return response.json()

    @abstractmethod
    def scrape(self) -> list[Job]:
        """Scrape jobs from the company's career page. Must be implemented by subclasses."""
        pass

    def classify_remote(self, location: Optional[str]) -> Optional[str]:
        """Classify a job's work arrangement from its location text.

        Matching is case-insensitive substring search.

        Args:
            location: Free-form location text; may be ``None`` or empty.

        Returns:
            ``None`` when no location is given, ``"hybrid"`` when the text
            mentions "hybrid", ``"remote"`` when it mentions "remote" (and
            not "hybrid"), otherwise ``"onsite"``.
        """
        if not location:
            return None
        location_lower = location.lower()
        # Check "hybrid" on its own and first: a location such as
        # "Hybrid - London" does not contain "remote" and must not be
        # reported as onsite.
        if "hybrid" in location_lower:
            return "hybrid"
        if "remote" in location_lower:
            return "remote"
        return "onsite"
# Initial commit: Job scraper for privacy/open-source companies
# - Scrapes job listings from Greenhouse, Lever, and Ashby platforms
# - Tracks 14 companies (1Password, DuckDuckGo, GitLab, etc.)
# - SQLite database for change detection
# - Filters by engineering job titles and location preferences
# - Generates static HTML dashboard with search/filter
# - Docker support for deployment to Debian server
# 2026-01-20 16:40:08 +00:00
			`from abc import ABC, abstractmethod`
			`from dataclasses import dataclass`
			`from datetime import datetime`
			`from typing import Optional`
			`import httpx`


			`@dataclass`
			`class Job:`
			`"""Represents a job listing."""`
			`external_id: str`
			`title: str`
			`url: str`
			`location: Optional[str] = None`
			`department: Optional[str] = None`
			`remote_type: Optional[str] = None # 'remote', 'hybrid', 'onsite'`

			`def __hash__(self):`
			`return hash(self.external_id)`

			`def __eq__(self, other):`
			`if isinstance(other, Job):`
			`return self.external_id == other.external_id`
			`return False`


			`class BaseScraper(ABC):`
			`"""Base class for all job scrapers."""`

			`def __init__(self, company_name: str, jobs_url: str, timeout: int = 30):`
			`self.company_name = company_name`
			`self.jobs_url = jobs_url`
			`self.timeout = timeout`
			`self.client = httpx.Client(`
			`timeout=timeout,`
			`headers={`
			`"User-Agent": "JobScraper/1.0 (Personal job search tool)"`
			`},`
			`follow_redirects=True`
			`)`

			`def __enter__(self):`
			`return self`

			`def __exit__(self, exc_type, exc_val, exc_tb):`
			`self.client.close()`

			`def fetch(self, url: Optional[str] = None) -> str:`
			`"""Fetch the content from a URL."""`
			`target_url = url or self.jobs_url`
			`response = self.client.get(target_url)`
			`response.raise_for_status()`
			`return response.text`

			`def fetch_json(self, url: Optional[str] = None) -> dict:`
			`"""Fetch JSON from a URL."""`
			`target_url = url or self.jobs_url`
			`response = self.client.get(target_url)`
			`response.raise_for_status()`
			`return response.json()`

			`@abstractmethod`
			`def scrape(self) -> list[Job]:`
			`"""Scrape jobs from the company's career page. Must be implemented by subclasses."""`
			`pass`

			`def classify_remote(self, location: str) -> Optional[str]:`
			`"""Try to classify if a job is remote based on location text."""`
			`if not location:`
			`return None`
			`location_lower = location.lower()`
			`if "remote" in location_lower:`
			`if "hybrid" in location_lower:`
			`return "hybrid"`
			`return "remote"`
			`return "onsite"`