job-scraper/main.py


#!/usr/bin/env python3
"""
Job Scraper - Monitor job openings from companies you're interested in.

Usage:
    python main.py              # Run once
    python main.py --schedule   # Run daily at 09:00
    python main.py --list       # List all tracked jobs
"""

import argparse
import sys
import time
from datetime import datetime
from pathlib import Path

import yaml

from db import Database
from notify import ChangeReport, Notifier
from scrapers import AshbyScraper, GreenhouseScraper, LeverScraper
from scrapers.base import BaseScraper, Job
from dashboard import generate_dashboard


def load_config(config_path: str = "config.yaml") -> dict:
    """Load configuration from YAML file."""
    with open(config_path) as f:
        return yaml.safe_load(f)


def get_scraper(company_config: dict) -> BaseScraper:
    """Create a scraper instance based on company configuration."""
    platform = company_config["platform"]
    name = company_config["name"]

    if platform == "greenhouse":
        return GreenhouseScraper(name, company_config["board_token"])
    elif platform == "lever":
        return LeverScraper(name, company_config["lever_company"])
    elif platform == "ashby":
        return AshbyScraper(name, company_config["ashby_company"])
    else:
        raise ValueError(f"Unknown platform: {platform}")
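
# Company entries in config.yaml are expected to carry the platform-specific
# field looked up above (the values here are placeholders):
#   {"name": "Example", "platform": "greenhouse", "board_token": "example"}
#   {"name": "Example", "platform": "lever", "lever_company": "example"}
#   {"name": "Example", "platform": "ashby", "ashby_company": "example"}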


def filter_jobs_by_title(jobs: list[Job], title_filters: list[str]) -> list[Job]:
    """Filter jobs to only include those matching title keywords."""
    if not title_filters:
        return jobs

    filtered = []
    for job in jobs:
        title_lower = job.title.lower()
        if any(keyword.lower() in title_lower for keyword in title_filters):
            filtered.append(job)
    return filtered
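
# Note: matching is a case-insensitive substring test, so a filter like
# "engineer" keeps both "Senior Software Engineer" and "Engineering Manager".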


def scrape_company(company_config: dict, db: Database, config: dict) -> ChangeReport:
    """Scrape jobs for a single company and detect changes."""
    name = company_config["name"]
    print(f"\n🔍 Scraping {name}...", end=" ", flush=True)

    try:
        with get_scraper(company_config) as scraper:
            # Get current jobs from the career page
            all_jobs = scraper.scrape()

            # Filter by title keywords if configured
            title_filters = config.get("title_filters", [])
            current_jobs = filter_jobs_by_title(all_jobs, title_filters)
            print(f"found {len(current_jobs)} jobs (of {len(all_jobs)} total)")

            # Get or create company in database, recording whichever platform
            # identifier is configured
            company_id = db.get_or_create_company(
                name,
                jobs_url=(
                    company_config.get("board_token")
                    or company_config.get("lever_company")
                    or company_config.get("ashby_company", "")
                ),
                platform_type=company_config["platform"],
            )

            # Get stored jobs
            stored_jobs = db.get_active_jobs(company_id)

            # Detect changes
            current_ids = {job.external_id for job in current_jobs}
            stored_ids = set(stored_jobs.keys())
            new_ids = current_ids - stored_ids  # informational; upsert_job below decides what is new
            removed_ids = stored_ids - current_ids

            # Process new jobs
            new_jobs = []
            for job in current_jobs:
                is_new, _ = db.upsert_job(company_id, job)
                if is_new:
                    new_jobs.append(job)

            # Mark removed jobs
            removed_jobs = db.mark_jobs_removed(company_id, removed_ids)

            # Update last scraped time
            db.update_company_scraped(company_id)

            # Apply location filters to highlight relevant jobs
            location_filters = config.get("location_filters", [])
            if location_filters and new_jobs:
                relevant_new = []
                for job in new_jobs:
                    if job.location:
                        loc_lower = job.location.lower()
                        if any(f.lower() in loc_lower for f in location_filters):
                            relevant_new.append(job)
                    elif job.remote_type == "remote":
                        relevant_new.append(job)
                if relevant_new:
                    print(f"{len(relevant_new)} jobs match your location filters!")

            return ChangeReport(
                company_name=name,
                new_jobs=new_jobs,
                removed_jobs=removed_jobs,
                total_active=len(current_jobs),
            )
    except Exception as e:
        print(f"ERROR: {e}")
        return ChangeReport(
            company_name=name,
            new_jobs=[],
            removed_jobs=[],
            total_active=0,
        )


def run_scraper(config: dict):
    """Run the scraper for all configured companies."""
    print(f"\n{'=' * 60}")
    print(f"Job Scraper - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'=' * 60}")

    db = Database()
    notifier = Notifier(config.get("notifications", {}))
    companies = config.get("companies", [])

    # Cleanup companies no longer in config
    active_names = [c["name"] for c in companies]
    removed = db.cleanup_removed_companies(active_names)
    if removed:
        print(f"\n🧹 Removed {len(removed)} companies no longer in config: {', '.join(removed)}")

    print(f"\nMonitoring {len(companies)} companies...")

    reports = []
    delay = config.get("scraper", {}).get("request_delay", 2)
    for i, company_config in enumerate(companies):
        report = scrape_company(company_config, db, config)
        reports.append(report)
        # Delay between companies (be respectful!)
        if i < len(companies) - 1:
            time.sleep(delay)

    # Send notifications
    notifier.notify(reports)

    # Summary
    total_jobs = sum(r.total_active for r in reports)
    total_new = sum(len(r.new_jobs) for r in reports)
    total_removed = sum(len(r.removed_jobs) for r in reports)
    print(f"\n📊 Total: {total_jobs} active jobs across {len(companies)} companies")
    print(f"   Changes: +{total_new} new, -{total_removed} removed")

    # Generate dashboard
    generate_dashboard()


def list_jobs(config: dict):
    """List all tracked jobs."""
    db = Database()
    jobs = db.get_all_active_jobs()
    if not jobs:
        print("No jobs tracked yet. Run the scraper first.")
        return

    print(f"\n{'=' * 60}")
    print(f"All Tracked Jobs ({len(jobs)} total)")
    print(f"{'=' * 60}")

    current_company = None
    for company_name, job in jobs:
        if company_name != current_company:
            print(f"\n📌 {company_name}")
            print("-" * 40)
            current_company = company_name
        location = f" [{job.location}]" if job.location else ""
        remote = " 🏠" if job.remote_type == "remote" else ""
        print(f"{job.title}{location}{remote}")
        print(f"   {job.url}")


def run_scheduled(config: dict):
    """Run the scraper on a schedule."""
    import schedule

    print("Starting scheduled job scraper...")
    print("Will run daily at 09:00")
    print("Press Ctrl+C to stop\n")

    # Run immediately on start
    run_scraper(config)

    # Schedule daily run
    schedule.every().day.at("09:00").do(run_scraper, config)

    while True:
        schedule.run_pending()
        time.sleep(60)


def main():
    parser = argparse.ArgumentParser(description="Job Scraper - Monitor job openings")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--schedule", action="store_true", help="Run on a schedule")
    parser.add_argument("--list", action="store_true", help="List all tracked jobs")
    args = parser.parse_args()

    # Load config
    config_path = Path(args.config)
    if not config_path.exists():
        print(f"Error: Config file not found: {config_path}")
        sys.exit(1)
    config = load_config(args.config)

    if args.list:
        list_jobs(config)
    elif args.schedule:
        run_scheduled(config)
    else:
        run_scraper(config)


if __name__ == "__main__":
    main()