#!/usr/bin/env python3
"""
Job Scraper - Monitor job openings from companies you're interested in.

Usage:
    python main.py              # Run once
    python main.py --schedule   # Run daily at 09:00
    python main.py --list       # List all tracked jobs
"""

import argparse
import sys
import time
from datetime import datetime
from pathlib import Path

import yaml

from db import Database
from notify import ChangeReport, Notifier
from scrapers import AshbyScraper, GreenhouseScraper, LeverScraper
from scrapers.base import BaseScraper, Job
from dashboard import generate_dashboard


def load_config(config_path: str = "config.yaml") -> dict:
    """Load configuration from YAML file."""
    with open(config_path) as f:
        return yaml.safe_load(f)


def get_scraper(company_config: dict) -> BaseScraper:
    """Create a scraper instance based on company configuration."""
    platform = company_config["platform"]
    name = company_config["name"]
    if platform == "greenhouse":
        return GreenhouseScraper(name, company_config["board_token"])
    elif platform == "lever":
        return LeverScraper(name, company_config["lever_company"])
    elif platform == "ashby":
        return AshbyScraper(name, company_config["ashby_company"])
    else:
        raise ValueError(f"Unknown platform: {platform}")


def filter_jobs_by_title(jobs: list[Job], title_filters: list[str]) -> list[Job]:
    """Filter jobs to only include those matching title keywords."""
    if not title_filters:
        return jobs
    filtered = []
    for job in jobs:
        title_lower = job.title.lower()
        if any(keyword.lower() in title_lower for keyword in title_filters):
            filtered.append(job)
    return filtered


def scrape_company(company_config: dict, db: Database, config: dict) -> ChangeReport:
    """Scrape jobs for a single company and detect changes."""
    name = company_config["name"]
    print(f"\nšŸ” Scraping {name}...", end=" ", flush=True)

    try:
        with get_scraper(company_config) as scraper:
            # Get current jobs from the career page
            all_jobs = scraper.scrape()

            # Filter by title keywords if configured
            title_filters = config.get("title_filters", [])
            current_jobs = filter_jobs_by_title(all_jobs, title_filters)

            print(f"found {len(current_jobs)} jobs (of {len(all_jobs)} total)")

            # Get or create company in database, storing whichever platform
            # identifier is configured (greenhouse, lever, or ashby)
            company_id = db.get_or_create_company(
                name,
                jobs_url=(
                    company_config.get("board_token")
                    or company_config.get("lever_company")
                    or company_config.get("ashby_company", "")
                ),
                platform_type=company_config["platform"],
            )

            # Get stored jobs
            stored_jobs = db.get_active_jobs(company_id)

            # Detect removals: jobs stored as active but absent from this
            # scrape. New jobs are detected via db.upsert_job below.
            current_ids = {job.external_id for job in current_jobs}
            stored_ids = set(stored_jobs.keys())
            removed_ids = stored_ids - current_ids

            # Process new jobs
            new_jobs = []
            for job in current_jobs:
                is_new, _ = db.upsert_job(company_id, job)
                if is_new:
                    new_jobs.append(job)

            # Mark removed jobs
            removed_jobs = db.mark_jobs_removed(company_id, removed_ids)

            # Update last scraped time
            db.update_company_scraped(company_id)

            # Apply location filters to highlight relevant jobs
            location_filters = config.get("location_filters", [])
            if location_filters and new_jobs:
                relevant_new = []
                for job in new_jobs:
                    if job.location:
                        loc_lower = job.location.lower()
                        if any(f.lower() in loc_lower for f in location_filters):
                            relevant_new.append(job)
                    elif job.remote_type == "remote":
                        relevant_new.append(job)
                if relevant_new:
                    print(f"  ⭐ {len(relevant_new)} jobs match your location filters!")

            return ChangeReport(
                company_name=name,
                new_jobs=new_jobs,
                removed_jobs=removed_jobs,
                total_active=len(current_jobs),
            )

    except Exception as e:
        print(f"ERROR: {e}")
        return ChangeReport(
            company_name=name,
            new_jobs=[],
            removed_jobs=[],
            total_active=0,
        )
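
# A minimal config.yaml sketch matching the keys this script reads (company
# entries per get_scraper above; filter, notification, and scraper keys per
# scrape_company and run_scraper). The company names and tokens below are
# illustrative placeholders, not real boards:
#
#   companies:
#     - name: Acme
#       platform: greenhouse
#       board_token: acme
#     - name: Beta Labs
#       platform: lever
#       lever_company: betalabs
#     - name: Gamma AI
#       platform: ashby
#       ashby_company: gamma-ai
#   title_filters:
#     - engineer
#   location_filters:
#     - london
#     - remote
#   notifications: {}
#   scraper:
#     request_delay: 2
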

def run_scraper(config: dict):
    """Run the scraper for all configured companies."""
    print(f"\n{'=' * 60}")
    print(f"Job Scraper - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'=' * 60}")

    db = Database()
    notifier = Notifier(config.get("notifications", {}))
    companies = config.get("companies", [])

    print(f"\nMonitoring {len(companies)} companies...")

    reports = []
    delay = config.get("scraper", {}).get("request_delay", 2)
    for i, company_config in enumerate(companies):
        report = scrape_company(company_config, db, config)
        reports.append(report)
        # Delay between companies (be respectful!)
        if i < len(companies) - 1:
            time.sleep(delay)

    # Send notifications
    notifier.notify(reports)

    # Summary
    total_jobs = sum(r.total_active for r in reports)
    total_new = sum(len(r.new_jobs) for r in reports)
    total_removed = sum(len(r.removed_jobs) for r in reports)
    print(f"\nšŸ“Š Total: {total_jobs} active jobs across {len(companies)} companies")
    print(f"   Changes: +{total_new} new, -{total_removed} removed")

    # Generate dashboard
    generate_dashboard()


def list_jobs(config: dict):
    """List all tracked jobs."""
    db = Database()
    jobs = db.get_all_active_jobs()

    if not jobs:
        print("No jobs tracked yet. Run the scraper first.")
        return

    print(f"\n{'=' * 60}")
    print(f"All Tracked Jobs ({len(jobs)} total)")
    print(f"{'=' * 60}")

    current_company = None
    for company_name, job in jobs:
        if company_name != current_company:
            print(f"\nšŸ“Œ {company_name}")
            print("-" * 40)
            current_company = company_name
        location = f" [{job.location}]" if job.location else ""
        remote = " šŸ " if job.remote_type == "remote" else ""
        print(f"  • {job.title}{location}{remote}")
        print(f"    {job.url}")


def run_scheduled(config: dict):
    """Run the scraper on a schedule."""
    import schedule

    print("Starting scheduled job scraper...")
    print("Will run daily at 09:00")
    print("Press Ctrl+C to stop\n")

    # Run immediately on start
    run_scraper(config)

    # Schedule daily run
    schedule.every().day.at("09:00").do(run_scraper, config)

    while True:
        schedule.run_pending()
        time.sleep(60)


def main():
    parser = argparse.ArgumentParser(description="Job Scraper - Monitor job openings")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--schedule", action="store_true", help="Run on a schedule")
    parser.add_argument("--list", action="store_true", help="List all tracked jobs")
    args = parser.parse_args()

    # Load config
    config_path = Path(args.config)
    if not config_path.exists():
        print(f"Error: Config file not found: {config_path}")
        sys.exit(1)
    config = load_config(args.config)

    if args.list:
        list_jobs(config)
    elif args.schedule:
        run_scheduled(config)
    else:
        run_scraper(config)


if __name__ == "__main__":
    main()
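
# Sketch: making the daily run time configurable rather than hardcoding
# "09:00" in run_scheduled. "schedule_time" is a hypothetical config key,
# not one the script currently reads:
#
#   run_time = config.get("scraper", {}).get("schedule_time", "09:00")
#   schedule.every().day.at(run_time).do(run_scraper, config)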