job-scraper/main.py


#!/usr/bin/env python3
"""
Job Scraper - Monitor job openings from companies you're interested in.

Usage:
    python main.py              # Run once
    python main.py --schedule   # Run daily at 09:00
    python main.py --list       # List all tracked jobs
"""

import argparse
import sys
import time
from datetime import datetime
from pathlib import Path

import yaml

from db import Database
from notify import ChangeReport, Notifier
from scrapers import AshbyScraper, GreenhouseScraper, LeverScraper
from scrapers.base import BaseScraper, Job
from dashboard import generate_dashboard


def load_config(config_path: str = "config.yaml") -> dict:
    """Load configuration from YAML file."""
    with open(config_path) as f:
        return yaml.safe_load(f)


def get_scraper(company_config: dict) -> BaseScraper:
    """Create a scraper instance based on company configuration."""
    platform = company_config["platform"]
    name = company_config["name"]

    if platform == "greenhouse":
        return GreenhouseScraper(name, company_config["board_token"])
    elif platform == "lever":
        return LeverScraper(name, company_config["lever_company"])
    elif platform == "ashby":
        return AshbyScraper(name, company_config["ashby_company"])
    else:
        raise ValueError(f"Unknown platform: {platform}")
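
# Company entries in config.yaml are expected to carry the platform-specific
# field looked up above (the values here are placeholders):
#   {"name": "Example", "platform": "greenhouse", "board_token": "example"}
#   {"name": "Example", "platform": "lever", "lever_company": "example"}
#   {"name": "Example", "platform": "ashby", "ashby_company": "example"}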


def filter_jobs_by_title(jobs: list[Job], title_filters: list[str]) -> list[Job]:
    """Filter jobs to only include those matching title keywords."""
    if not title_filters:
        return jobs

    filtered = []
    for job in jobs:
        title_lower = job.title.lower()
        if any(keyword.lower() in title_lower for keyword in title_filters):
            filtered.append(job)
    return filtered
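
# Note: matching is a case-insensitive substring test, so a filter like
# "engineer" keeps both "Senior Software Engineer" and "Engineering Manager".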


def scrape_company(company_config: dict, db: Database, config: dict) -> ChangeReport:
    """Scrape jobs for a single company and detect changes."""
    name = company_config["name"]
    print(f"\n🔍 Scraping {name}...", end=" ", flush=True)

    try:
        with get_scraper(company_config) as scraper:
            # Get current jobs from the career page
            all_jobs = scraper.scrape()

            # Filter by title keywords if configured
            title_filters = config.get("title_filters", [])
            current_jobs = filter_jobs_by_title(all_jobs, title_filters)
            print(f"found {len(current_jobs)} jobs (of {len(all_jobs)} total)")

            # Get or create company in database, recording whichever platform
            # identifier is configured
            company_id = db.get_or_create_company(
                name,
                jobs_url=(
                    company_config.get("board_token")
                    or company_config.get("lever_company")
                    or company_config.get("ashby_company", "")
                ),
                platform_type=company_config["platform"],
            )

            # Get stored jobs
            stored_jobs = db.get_active_jobs(company_id)

            # Detect changes
            current_ids = {job.external_id for job in current_jobs}
            stored_ids = set(stored_jobs.keys())
            new_ids = current_ids - stored_ids  # informational; upsert_job below decides what is new
            removed_ids = stored_ids - current_ids

            # Process new jobs
            new_jobs = []
            for job in current_jobs:
                is_new, _ = db.upsert_job(company_id, job)
                if is_new:
                    new_jobs.append(job)

            # Mark removed jobs
            removed_jobs = db.mark_jobs_removed(company_id, removed_ids)

            # Update last scraped time
            db.update_company_scraped(company_id)

            # Apply location filters to highlight relevant jobs
            location_filters = config.get("location_filters", [])
            if location_filters and new_jobs:
                relevant_new = []
                for job in new_jobs:
                    if job.location:
                        loc_lower = job.location.lower()
                        if any(f.lower() in loc_lower for f in location_filters):
                            relevant_new.append(job)
                    elif job.remote_type == "remote":
                        relevant_new.append(job)
                if relevant_new:
                    print(f"{len(relevant_new)} jobs match your location filters!")

            return ChangeReport(
                company_name=name,
                new_jobs=new_jobs,
                removed_jobs=removed_jobs,
                total_active=len(current_jobs),
            )
    except Exception as e:
        print(f"ERROR: {e}")
        return ChangeReport(
            company_name=name,
            new_jobs=[],
            removed_jobs=[],
            total_active=0,
        )


def run_scraper(config: dict):
    """Run the scraper for all configured companies."""
    print(f"\n{'=' * 60}")
    print(f"Job Scraper - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'=' * 60}")

    db = Database()
    notifier = Notifier(config.get("notifications", {}))
    companies = config.get("companies", [])

    # Cleanup companies no longer in config
    active_names = [c["name"] for c in companies]
    removed = db.cleanup_removed_companies(active_names)
    if removed:
        print(f"\n🧹 Removed {len(removed)} companies no longer in config: {', '.join(removed)}")

    print(f"\nMonitoring {len(companies)} companies...")

    reports = []
    delay = config.get("scraper", {}).get("request_delay", 2)
    for i, company_config in enumerate(companies):
        report = scrape_company(company_config, db, config)
        reports.append(report)
        # Delay between companies (be respectful!)
        if i < len(companies) - 1:
            time.sleep(delay)

    # Send notifications
    notifier.notify(reports)

    # Summary
    total_jobs = sum(r.total_active for r in reports)
    total_new = sum(len(r.new_jobs) for r in reports)
    total_removed = sum(len(r.removed_jobs) for r in reports)
    print(f"\n📊 Total: {total_jobs} active jobs across {len(companies)} companies")
    print(f"   Changes: +{total_new} new, -{total_removed} removed")

    # Generate dashboard
    generate_dashboard()


def list_jobs(config: dict):
    """List all tracked jobs."""
    db = Database()
    jobs = db.get_all_active_jobs()
    if not jobs:
        print("No jobs tracked yet. Run the scraper first.")
        return

    print(f"\n{'=' * 60}")
    print(f"All Tracked Jobs ({len(jobs)} total)")
    print(f"{'=' * 60}")

    current_company = None
    for company_name, job in jobs:
        if company_name != current_company:
            print(f"\n📌 {company_name}")
            print("-" * 40)
            current_company = company_name
        location = f" [{job.location}]" if job.location else ""
        remote = " 🏠" if job.remote_type == "remote" else ""
        print(f"{job.title}{location}{remote}")
        print(f"   {job.url}")


def run_scheduled(config: dict):
    """Run the scraper on a schedule."""
    import schedule

    print("Starting scheduled job scraper...")
    print("Will run daily at 09:00")
    print("Press Ctrl+C to stop\n")

    # Run immediately on start
    run_scraper(config)

    # Schedule daily run
    schedule.every().day.at("09:00").do(run_scraper, config)

    while True:
        schedule.run_pending()
        time.sleep(60)


def main():
    parser = argparse.ArgumentParser(description="Job Scraper - Monitor job openings")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--schedule", action="store_true", help="Run on a schedule")
    parser.add_argument("--list", action="store_true", help="List all tracked jobs")
    args = parser.parse_args()

    # Load config
    config_path = Path(args.config)
    if not config_path.exists():
        print(f"Error: Config file not found: {config_path}")
        sys.exit(1)
    config = load_config(args.config)

    if args.list:
        list_jobs(config)
    elif args.schedule:
        run_scheduled(config)
    else:
        run_scraper(config)


if __name__ == "__main__":
    main()