# - Scrapes job listings from Greenhouse, Lever, and Ashby platforms
# - Tracks 14 companies (1Password, DuckDuckGo, GitLab, etc.)
# - SQLite database for change detection
# - Filters by engineering job titles and location preferences
# - Generates static HTML dashboard with search/filter
# - Docker support for deployment to Debian server
76 lines
2.2 KiB
Python
76 lines
2.2 KiB
Python
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
import httpx
|
|
|
|
|
|
@dataclass
class Job:
    """Represents a job listing.

    Identity is defined by ``external_id`` alone: two ``Job`` objects with the
    same ``external_id`` compare equal and hash alike even when their other
    fields differ, so sets and dict keys deduplicate on that field.
    """

    external_id: str  # the only field consulted by == and hash()
    title: str
    url: str
    location: Optional[str] = None
    department: Optional[str] = None
    remote_type: Optional[str] = None  # 'remote', 'hybrid', 'onsite'

    def __hash__(self) -> int:
        """Hash on ``external_id`` only, consistent with ``__eq__``."""
        return hash(self.external_id)

    def __eq__(self, other: object) -> bool:
        """Return True when ``other`` is a ``Job`` with the same ``external_id``.

        For any other operand type ``NotImplemented`` is returned so Python
        can try the reflected comparison; when that is unavailable too, ``==``
        still evaluates to ``False``.
        """
        if not isinstance(other, Job):
            return NotImplemented
        return self.external_id == other.external_id
|
|
|
|
|
|
class BaseScraper(ABC):
    """Base class for all job scrapers.

    Each instance owns an ``httpx.Client`` configured with the given timeout,
    a fixed ``User-Agent`` header, and redirect following. Subclasses must
    implement :meth:`scrape`. The instance is a context manager; leaving the
    ``with`` block closes the client.
    """

    def __init__(self, company_name: str, jobs_url: str, timeout: int = 30):
        """Store the scraper settings and create the HTTP client.

        Args:
            company_name: Name of the company this scraper targets.
            jobs_url: Default URL used by :meth:`fetch` and :meth:`fetch_json`
                when no explicit URL is passed.
            timeout: Timeout value handed to ``httpx.Client``.
        """
        self.company_name = company_name
        self.jobs_url = jobs_url
        self.timeout = timeout
        self.client = httpx.Client(
            timeout=timeout,
            headers={
                "User-Agent": "JobScraper/1.0 (Personal job search tool)"
            },
            follow_redirects=True,
        )

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the HTTP client; exceptions from the block are not suppressed."""
        self.client.close()

    def _fetch_response(self, url: Optional[str] = None):
        """GET ``url`` (or ``jobs_url`` when falsy) and return the response.

        Shared by :meth:`fetch` and :meth:`fetch_json` so both apply the same
        status check: ``raise_for_status()`` raises on error responses.
        """
        response = self.client.get(url or self.jobs_url)
        response.raise_for_status()
        return response

    def fetch(self, url: Optional[str] = None) -> str:
        """Fetch the content from a URL and return the response body as text.

        Args:
            url: URL to request; defaults to ``self.jobs_url``.

        Raises:
            httpx.HTTPStatusError: If the response has an error status.
        """
        return self._fetch_response(url).text

    def fetch_json(self, url: Optional[str] = None) -> dict:
        """Fetch a URL and return its body decoded as JSON.

        Args:
            url: URL to request; defaults to ``self.jobs_url``.

        Returns:
            Whatever ``response.json()`` decodes. Annotated as ``dict``; an
            endpoint whose top-level JSON value is an array would yield a
            list instead.

        Raises:
            httpx.HTTPStatusError: If the response has an error status.
        """
        return self._fetch_response(url).json()

    @abstractmethod
    def scrape(self) -> "list[Job]":
        """Scrape jobs from the company's career page. Must be implemented by subclasses."""
        pass

    def classify_remote(self, location: Optional[str]) -> Optional[str]:
        """Try to classify if a job is remote based on location text.

        Matching is a case-insensitive substring search.

        Args:
            location: Free-text location, or ``None``/empty when unknown.

        Returns:
            ``None`` when ``location`` is empty, otherwise ``"hybrid"``,
            ``"remote"`` or ``"onsite"``.
        """
        if not location:
            return None

        location_lower = location.lower()
        # Check "hybrid" first and on its own: a location such as
        # "Hybrid - London" does not contain "remote" but is still hybrid,
        # and "Remote (hybrid)" should also be reported as hybrid.
        if "hybrid" in location_lower:
            return "hybrid"
        if "remote" in location_lower:
            return "remote"
        return "onsite"
|