From e8eb9d3fcf58727a5d31f02e9cbada7545238a09 Mon Sep 17 00:00:00 2001 From: Bastian Gruber Date: Tue, 20 Jan 2026 12:40:08 -0400 Subject: [PATCH] Initial commit: Job scraper for privacy/open-source companies - Scrapes job listings from Greenhouse, Lever, and Ashby platforms - Tracks 14 companies (1Password, DuckDuckGo, GitLab, etc.) - SQLite database for change detection - Filters by engineering job titles and location preferences - Generates static HTML dashboard with search/filter - Docker support for deployment to Debian server --- .gitignore | 167 ++---------------- Dockerfile | 16 ++ README.md | 133 +++++++++++++- config.yaml | 116 +++++++++++++ dashboard.py | 385 +++++++++++++++++++++++++++++++++++++++++ db.py | 238 +++++++++++++++++++++++++ docker-compose.yaml | 35 ++++ main.py | 246 ++++++++++++++++++++++++++ nginx.conf | 24 +++ notify.py | 178 +++++++++++++++++++ requirements.txt | 5 + scrapers/__init__.py | 6 + scrapers/ashby.py | 51 ++++++ scrapers/base.py | 76 ++++++++ scrapers/greenhouse.py | 42 +++++ scrapers/lever.py | 50 ++++++ 16 files changed, 1613 insertions(+), 155 deletions(-) create mode 100644 Dockerfile create mode 100644 config.yaml create mode 100644 dashboard.py create mode 100644 db.py create mode 100644 docker-compose.yaml create mode 100644 main.py create mode 100644 nginx.conf create mode 100644 notify.py create mode 100644 requirements.txt create mode 100644 scrapers/__init__.py create mode 100644 scrapers/ashby.py create mode 100644 scrapers/base.py create mode 100644 scrapers/greenhouse.py create mode 100644 scrapers/lever.py diff --git a/.gitignore b/.gitignore index ab3e8ce..6f9e776 100644 --- a/.gitignore +++ b/.gitignore @@ -1,164 +1,25 @@ -# ---> Python -# Byte-compiled / optimized / DLL files +# Python __pycache__/ *.py[cod] *$py.class - -# C extensions *.so - -# Distribution / packaging .Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
-# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ venv/ +.venv/ ENV/ -env.bak/ -venv.bak/ -# Spyder project settings -.spyderproject -.spyproject +# Data +data/*.db -# Rope project settings -.ropeproject +# IDE +.idea/ +.vscode/ +*.swp +*.swo -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +# OS +.DS_Store +Thumbs.db +# Secrets (if you add email credentials) +.env diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9b9bdb3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY . . + +# Create data directory for SQLite database +RUN mkdir -p /app/data + +# Run the scraper +CMD ["python", "main.py"] diff --git a/README.md b/README.md index 5aaaf94..249fc7b 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,132 @@ -# job-scraper +# Job Scraper -Track openings for companies I am interested in \ No newline at end of file +Monitor job openings from privacy-focused and open-source companies. Runs daily and shows changes. + +## Quick Start (Local) + +```bash +# Create venv and install deps +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt + +# Run once +python main.py + +# View dashboard +open data/dashboard.html +``` + +## Deploy to Debian Server + +### 1. Install Docker + +```bash +# Install Docker +curl -fsSL https://get.docker.com | sh +sudo usermod -aG docker $USER +# Log out and back in + +# Install Docker Compose +sudo apt install docker-compose-plugin +``` + +### 2. Clone/Copy the project + +```bash +# Copy project to server +scp -r job-scraper user@your-server:~/ + +# Or clone from git if you pushed it +git clone ~/job-scraper +``` + +### 3. Run with Docker Compose + +```bash +cd ~/job-scraper + +# Run scraper once to populate data +docker compose run --rm scraper + +# Start dashboard + scheduled scraper +docker compose up -d scraper-scheduled dashboard + +# View logs +docker compose logs -f +``` + +### 4. Access the dashboard + +Open `http://your-server:8080` in your browser. 
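If the page does not load, a quick check from the server itself narrows things down. A minimal sketch, assuming the compose stack above is running and the scraper has already written `data/dashboard.html`:

```bash
# Did the scraper produce the file nginx serves?
ls -l data/dashboard.html

# Is the dashboard container answering on port 8080?
curl -I http://localhost:8080

# If either check fails, the container logs usually say why
docker compose logs dashboard
docker compose logs scraper-scheduled
```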
+ +### Optional: Use a reverse proxy + +If you want HTTPS or a custom domain, add nginx/caddy in front: + +```bash +# Example with Caddy (auto HTTPS) +sudo apt install caddy +echo "jobs.yourdomain.com { + reverse_proxy localhost:8080 +}" | sudo tee /etc/caddy/Caddyfile +sudo systemctl reload caddy +``` + +## Commands + +```bash +# Run scraper once +docker compose run --rm scraper + +# Run scraper with schedule (daily 9 AM) +docker compose up -d scraper-scheduled + +# Start web dashboard +docker compose up -d dashboard + +# View all jobs +docker compose run --rm scraper python main.py --list + +# Stop everything +docker compose down + +# View logs +docker compose logs -f scraper-scheduled +``` + +## Configuration + +Edit `config.yaml` to: +- Add/remove companies +- Change location filters +- Configure email/Slack notifications + +## Dashboard Features + +- Dark theme, monospace font +- Filter jobs by typing (press `/` to focus, `Esc` to clear) +- Color-coded tags: `remote`, `canada`, `berlin` +- Jump to company links +- Updates automatically when scraper runs + +## Project Structure + +``` +job-scraper/ +├── main.py # CLI entry point +├── db.py # SQLite database +├── dashboard.py # HTML generator +├── notify.py # Notifications +├── scrapers/ # Platform scrapers +│ ├── base.py # Base class +│ ├── greenhouse.py # Greenhouse API +│ ├── lever.py # Lever API +│ └── ashby.py # Ashby API +├── config.yaml # Company list & settings +├── Dockerfile +├── docker-compose.yaml +└── data/ + ├── jobs.db # SQLite database + └── dashboard.html # Generated dashboard +``` diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..79ac383 --- /dev/null +++ b/config.yaml @@ -0,0 +1,116 @@ +# Job Scraper Configuration +# =========================== + +# Location filters - jobs matching these locations will be highlighted +location_filters: + - remote + - canada + - toronto + - vancouver + - berlin + - germany + +# Job title filters - only jobs containing these keywords will be tracked +# Leave empty or remove to track all jobs +title_filters: + - engineer + - developer + - software + - sre + - devops + - infrastructure + - platform + - backend + - frontend + - fullstack + - full-stack + - security + +# Companies to monitor +# Each company needs: name, platform, and platform-specific config +companies: + # Privacy & Security Focused + - name: Signal + platform: lever + lever_company: signal + + - name: DuckDuckGo + platform: ashby + ashby_company: duck-duck-go + + - name: 1Password + platform: ashby + ashby_company: 1password + + - name: Bitwarden + platform: greenhouse + board_token: bitwarden + + # Open Source Infrastructure & DevTools + - name: GrafanaLabs + platform: greenhouse + board_token: grafanalabs + + - name: GitLab + platform: greenhouse + board_token: gitlab + + - name: Sourcegraph + platform: greenhouse + board_token: sourcegraph91 + + - name: Supabase + platform: ashby + ashby_company: supabase + + - name: Tailscale + platform: greenhouse + board_token: tailscale + + - name: HashiCorp + platform: greenhouse + board_token: hashicorp + + # Developer Tools & Platforms + - name: Automattic + platform: greenhouse + board_token: automatticcareers + + - name: Canonical + platform: greenhouse + board_token: canonical + + - name: ClickHouse + platform: greenhouse + board_token: clickhouse + + - name: Cloudflare + platform: greenhouse + board_token: cloudflare + +# Notification settings (optional - configure as needed) +notifications: + # Console output is always enabled + console: true + + # 
Uncomment and configure for email notifications + # email: + # smtp_host: smtp.gmail.com + # smtp_port: 587 + # username: your-email@gmail.com + # password: your-app-password + # from_addr: your-email@gmail.com + # to_addr: your-email@gmail.com + + # Uncomment for Slack webhook + # slack: + # webhook_url: https://hooks.slack.com/services/YOUR/WEBHOOK/URL + +# Scraper settings +scraper: + # Delay between requests in seconds (be respectful!) + request_delay: 2 + # Timeout for requests in seconds + timeout: 30 + # Number of retries on failure + retries: 3 diff --git a/dashboard.py b/dashboard.py new file mode 100644 index 0000000..dd4ff1f --- /dev/null +++ b/dashboard.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +""" +Generate a simple text-based HTML dashboard of all tracked jobs. +""" + +from datetime import datetime +from pathlib import Path + +from db import Database + + +def generate_dashboard(output_path: str = "data/dashboard.html"): + """Generate a static HTML dashboard.""" + db = Database() + jobs = db.get_all_active_jobs() + + # Group by company + companies = {} + for company_name, job in jobs: + if company_name not in companies: + companies[company_name] = [] + companies[company_name].append(job) + + # Sort companies by name + sorted_companies = sorted(companies.items()) + + html = f""" + + + + + Job Board + + + +
+

$ job-board

+
+ Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} | + {len(jobs)} jobs across {len(companies)} companies +
+
+ +
+ +
+ + + + + + + + + + + +
+
+ {len(jobs)} jobs shown +
+
+ +
+
Jump to company:
+ +
+ +
+""" + + # Job listings + for company_name, company_jobs in sorted_companies: + anchor = company_name.lower().replace(" ", "-") + html += f""" +
+
+ {company_name} + {len(company_jobs)} positions +
+
+""" + for job in sorted(company_jobs, key=lambda j: j.title): + location = job.location or "" + location_lower = location.lower() + + # Tags + tags = "" + if job.remote_type == "remote" or "remote" in location_lower: + tags += 'remote' + if "canada" in location_lower or "toronto" in location_lower or "vancouver" in location_lower: + tags += 'canada' + if "berlin" in location_lower or "germany" in location_lower: + tags += 'berlin' + + html += f"""
+ {job.title}{tags} + {location} +
+""" + html += """
+
+""" + + html += """
+ + + + +""" + + # Write the file + output = Path(output_path) + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(html) + print(f"Dashboard generated: {output_path}") + return output_path + + +if __name__ == "__main__": + generate_dashboard() diff --git a/db.py b/db.py new file mode 100644 index 0000000..7e13a17 --- /dev/null +++ b/db.py @@ -0,0 +1,238 @@ +import sqlite3 +from contextlib import contextmanager +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Optional + +from scrapers.base import Job + + +@dataclass +class StoredJob: + """A job stored in the database.""" + id: int + company_id: int + external_id: str + title: str + url: str + location: Optional[str] + department: Optional[str] + remote_type: Optional[str] + first_seen: datetime + last_seen: datetime + status: str # 'active' or 'removed' + + +class Database: + """SQLite database for storing job listings.""" + + def __init__(self, db_path: str = "data/jobs.db"): + self.db_path = Path(db_path) + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._init_db() + + @contextmanager + def _get_conn(self): + """Get a database connection.""" + conn = sqlite3.connect(self.db_path) + conn.row_factory = sqlite3.Row + try: + yield conn + conn.commit() + finally: + conn.close() + + def _init_db(self): + """Initialize the database schema.""" + with self._get_conn() as conn: + conn.executescript(""" + CREATE TABLE IF NOT EXISTS companies ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL UNIQUE, + jobs_url TEXT, + platform_type TEXT, + last_scraped TIMESTAMP, + active BOOLEAN DEFAULT TRUE + ); + + CREATE TABLE IF NOT EXISTS jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + company_id INTEGER REFERENCES companies(id), + external_id TEXT NOT NULL, + title TEXT NOT NULL, + url TEXT NOT NULL, + location TEXT, + department TEXT, + remote_type TEXT, + first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + last_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + status TEXT DEFAULT 'active', + UNIQUE(company_id, external_id) + ); + + CREATE INDEX IF NOT EXISTS idx_jobs_company ON jobs(company_id); + CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status); + """) + + def get_or_create_company(self, name: str, jobs_url: str = None, platform_type: str = None) -> int: + """Get or create a company and return its ID.""" + with self._get_conn() as conn: + cursor = conn.execute( + "SELECT id FROM companies WHERE name = ?", (name,) + ) + row = cursor.fetchone() + if row: + return row["id"] + + cursor = conn.execute( + "INSERT INTO companies (name, jobs_url, platform_type) VALUES (?, ?, ?)", + (name, jobs_url, platform_type) + ) + return cursor.lastrowid + + def update_company_scraped(self, company_id: int): + """Update the last_scraped timestamp for a company.""" + with self._get_conn() as conn: + conn.execute( + "UPDATE companies SET last_scraped = ? WHERE id = ?", + (datetime.now(), company_id) + ) + + def get_active_jobs(self, company_id: int) -> dict[str, StoredJob]: + """Get all active jobs for a company, keyed by external_id.""" + with self._get_conn() as conn: + cursor = conn.execute( + """SELECT * FROM jobs WHERE company_id = ? 
AND status = 'active'""", + (company_id,) + ) + jobs = {} + for row in cursor.fetchall(): + job = StoredJob( + id=row["id"], + company_id=row["company_id"], + external_id=row["external_id"], + title=row["title"], + url=row["url"], + location=row["location"], + department=row["department"], + remote_type=row["remote_type"], + first_seen=row["first_seen"], + last_seen=row["last_seen"], + status=row["status"] + ) + jobs[job.external_id] = job + return jobs + + def upsert_job(self, company_id: int, job: Job) -> tuple[bool, Optional[StoredJob]]: + """ + Insert or update a job. + Returns (is_new, old_job) where old_job is the previous version if it existed. + """ + with self._get_conn() as conn: + # Check if job exists + cursor = conn.execute( + "SELECT * FROM jobs WHERE company_id = ? AND external_id = ?", + (company_id, job.external_id) + ) + existing = cursor.fetchone() + + if existing: + # Update last_seen and ensure status is active + conn.execute( + """UPDATE jobs SET + title = ?, url = ?, location = ?, department = ?, + remote_type = ?, last_seen = ?, status = 'active' + WHERE id = ?""", + (job.title, job.url, job.location, job.department, + job.remote_type, datetime.now(), existing["id"]) + ) + old_job = StoredJob( + id=existing["id"], + company_id=existing["company_id"], + external_id=existing["external_id"], + title=existing["title"], + url=existing["url"], + location=existing["location"], + department=existing["department"], + remote_type=existing["remote_type"], + first_seen=existing["first_seen"], + last_seen=existing["last_seen"], + status=existing["status"] + ) + return False, old_job + else: + # Insert new job + conn.execute( + """INSERT INTO jobs + (company_id, external_id, title, url, location, department, remote_type) + VALUES (?, ?, ?, ?, ?, ?, ?)""", + (company_id, job.external_id, job.title, job.url, + job.location, job.department, job.remote_type) + ) + return True, None + + def mark_jobs_removed(self, company_id: int, external_ids: set[str]) -> list[StoredJob]: + """Mark jobs as removed. Returns the jobs that were marked removed.""" + if not external_ids: + return [] + + removed = [] + with self._get_conn() as conn: + placeholders = ",".join("?" * len(external_ids)) + cursor = conn.execute( + f"""SELECT * FROM jobs + WHERE company_id = ? AND external_id IN ({placeholders}) AND status = 'active'""", + (company_id, *external_ids) + ) + + for row in cursor.fetchall(): + removed.append(StoredJob( + id=row["id"], + company_id=row["company_id"], + external_id=row["external_id"], + title=row["title"], + url=row["url"], + location=row["location"], + department=row["department"], + remote_type=row["remote_type"], + first_seen=row["first_seen"], + last_seen=row["last_seen"], + status=row["status"] + )) + + conn.execute( + f"""UPDATE jobs SET status = 'removed', last_seen = ? + WHERE company_id = ? AND external_id IN ({placeholders})""", + (datetime.now(), company_id, *external_ids) + ) + + return removed + + def get_all_active_jobs(self) -> list[tuple[str, StoredJob]]: + """Get all active jobs across all companies. 
Returns (company_name, job) tuples.""" + with self._get_conn() as conn: + cursor = conn.execute( + """SELECT c.name as company_name, j.* + FROM jobs j + JOIN companies c ON j.company_id = c.id + WHERE j.status = 'active' + ORDER BY c.name, j.title""" + ) + results = [] + for row in cursor.fetchall(): + job = StoredJob( + id=row["id"], + company_id=row["company_id"], + external_id=row["external_id"], + title=row["title"], + url=row["url"], + location=row["location"], + department=row["department"], + remote_type=row["remote_type"], + first_seen=row["first_seen"], + last_seen=row["last_seen"], + status=row["status"] + ) + results.append((row["company_name"], job)) + return results diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..16229da --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,35 @@ +services: + # Run scraper once (for manual/cron triggering) + scraper: + build: . + container_name: job-scraper + volumes: + - ./data:/app/data + - ./config.yaml:/app/config.yaml:ro + environment: + - TZ=America/Toronto + + # Scheduled scraper - runs daily at 9 AM + scraper-scheduled: + build: . + container_name: job-scraper-scheduled + volumes: + - ./data:/app/data + - ./config.yaml:/app/config.yaml:ro + environment: + - TZ=America/Toronto + command: ["python", "main.py", "--schedule"] + restart: unless-stopped + + # Web dashboard - lightweight static file server + dashboard: + image: nginx:alpine + container_name: job-dashboard + ports: + - "8080:80" + volumes: + - ./data:/usr/share/nginx/html:ro + - ./nginx.conf:/etc/nginx/conf.d/default.conf:ro + restart: unless-stopped + depends_on: + - scraper diff --git a/main.py b/main.py new file mode 100644 index 0000000..ef6a1be --- /dev/null +++ b/main.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 +""" +Job Scraper - Monitor job openings from companies you're interested in. 
+ +Usage: + python main.py # Run once + python main.py --schedule # Run daily at configured time + python main.py --list # List all tracked jobs +""" + +import argparse +import sys +import time +from datetime import datetime +from pathlib import Path + +import yaml + +from db import Database +from notify import ChangeReport, Notifier +from scrapers import AshbyScraper, GreenhouseScraper, LeverScraper +from scrapers.base import BaseScraper, Job +from dashboard import generate_dashboard + + +def load_config(config_path: str = "config.yaml") -> dict: + """Load configuration from YAML file.""" + with open(config_path) as f: + return yaml.safe_load(f) + + +def get_scraper(company_config: dict) -> BaseScraper: + """Create a scraper instance based on company configuration.""" + platform = company_config["platform"] + name = company_config["name"] + + if platform == "greenhouse": + return GreenhouseScraper(name, company_config["board_token"]) + elif platform == "lever": + return LeverScraper(name, company_config["lever_company"]) + elif platform == "ashby": + return AshbyScraper(name, company_config["ashby_company"]) + else: + raise ValueError(f"Unknown platform: {platform}") + + +def filter_jobs_by_title(jobs: list[Job], title_filters: list[str]) -> list[Job]: + """Filter jobs to only include those matching title keywords.""" + if not title_filters: + return jobs + + filtered = [] + for job in jobs: + title_lower = job.title.lower() + if any(keyword.lower() in title_lower for keyword in title_filters): + filtered.append(job) + return filtered + + +def scrape_company(company_config: dict, db: Database, config: dict) -> ChangeReport: + """Scrape jobs for a single company and detect changes.""" + name = company_config["name"] + print(f"\n🔍 Scraping {name}...", end=" ", flush=True) + + try: + with get_scraper(company_config) as scraper: + # Get current jobs from the career page + all_jobs = scraper.scrape() + + # Filter by title keywords if configured + title_filters = config.get("title_filters", []) + current_jobs = filter_jobs_by_title(all_jobs, title_filters) + + print(f"found {len(current_jobs)} jobs (of {len(all_jobs)} total)") + + # Get or create company in database + company_id = db.get_or_create_company( + name, + jobs_url=company_config.get("board_token", company_config.get("lever_company", "")), + platform_type=company_config["platform"] + ) + + # Get stored jobs + stored_jobs = db.get_active_jobs(company_id) + + # Detect changes + current_ids = {job.external_id for job in current_jobs} + stored_ids = set(stored_jobs.keys()) + + new_ids = current_ids - stored_ids + removed_ids = stored_ids - current_ids + + # Process new jobs + new_jobs = [] + for job in current_jobs: + is_new, _ = db.upsert_job(company_id, job) + if is_new: + new_jobs.append(job) + + # Mark removed jobs + removed_jobs = db.mark_jobs_removed(company_id, removed_ids) + + # Update last scraped time + db.update_company_scraped(company_id) + + # Apply location filters to highlight relevant jobs + location_filters = config.get("location_filters", []) + if location_filters and new_jobs: + relevant_new = [] + for job in new_jobs: + if job.location: + loc_lower = job.location.lower() + if any(f.lower() in loc_lower for f in location_filters): + relevant_new.append(job) + elif job.remote_type == "remote": + relevant_new.append(job) + + if relevant_new: + print(f" ⭐ {len(relevant_new)} jobs match your location filters!") + + return ChangeReport( + company_name=name, + new_jobs=new_jobs, + removed_jobs=removed_jobs, + 
total_active=len(current_jobs) + ) + + except Exception as e: + print(f"ERROR: {e}") + return ChangeReport( + company_name=name, + new_jobs=[], + removed_jobs=[], + total_active=0 + ) + + +def run_scraper(config: dict): + """Run the scraper for all configured companies.""" + print(f"\n{'=' * 60}") + print(f"Job Scraper - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print(f"{'=' * 60}") + + db = Database() + notifier = Notifier(config.get("notifications", {})) + + companies = config.get("companies", []) + print(f"\nMonitoring {len(companies)} companies...") + + reports = [] + delay = config.get("scraper", {}).get("request_delay", 2) + + for i, company_config in enumerate(companies): + report = scrape_company(company_config, db, config) + reports.append(report) + + # Delay between companies (be respectful!) + if i < len(companies) - 1: + time.sleep(delay) + + # Send notifications + notifier.notify(reports) + + # Summary + total_jobs = sum(r.total_active for r in reports) + total_new = sum(len(r.new_jobs) for r in reports) + total_removed = sum(len(r.removed_jobs) for r in reports) + + print(f"\n📊 Total: {total_jobs} active jobs across {len(companies)} companies") + print(f" Changes: +{total_new} new, -{total_removed} removed") + + # Generate dashboard + generate_dashboard() + + +def list_jobs(config: dict): + """List all tracked jobs.""" + db = Database() + jobs = db.get_all_active_jobs() + + if not jobs: + print("No jobs tracked yet. Run the scraper first.") + return + + print(f"\n{'=' * 60}") + print(f"All Tracked Jobs ({len(jobs)} total)") + print(f"{'=' * 60}") + + current_company = None + for company_name, job in jobs: + if company_name != current_company: + print(f"\n📌 {company_name}") + print("-" * 40) + current_company = company_name + + location = f" [{job.location}]" if job.location else "" + remote = " 🏠" if job.remote_type == "remote" else "" + print(f" • {job.title}{location}{remote}") + print(f" {job.url}") + + +def run_scheduled(config: dict): + """Run the scraper on a schedule.""" + import schedule + + print("Starting scheduled job scraper...") + print("Will run daily at 09:00") + print("Press Ctrl+C to stop\n") + + # Run immediately on start + run_scraper(config) + + # Schedule daily run + schedule.every().day.at("09:00").do(run_scraper, config) + + while True: + schedule.run_pending() + time.sleep(60) + + +def main(): + parser = argparse.ArgumentParser(description="Job Scraper - Monitor job openings") + parser.add_argument("--config", default="config.yaml", help="Path to config file") + parser.add_argument("--schedule", action="store_true", help="Run on a schedule") + parser.add_argument("--list", action="store_true", help="List all tracked jobs") + + args = parser.parse_args() + + # Load config + config_path = Path(args.config) + if not config_path.exists(): + print(f"Error: Config file not found: {config_path}") + sys.exit(1) + + config = load_config(args.config) + + if args.list: + list_jobs(config) + elif args.schedule: + run_scheduled(config) + else: + run_scraper(config) + + +if __name__ == "__main__": + main() diff --git a/nginx.conf b/nginx.conf new file mode 100644 index 0000000..371a2ac --- /dev/null +++ b/nginx.conf @@ -0,0 +1,24 @@ +server { + listen 80; + server_name _; + root /usr/share/nginx/html; + + # Serve dashboard.html as the index + location / { + try_files /dashboard.html =404; + } + + # Cache static assets + location ~* \.(html|css|js)$ { + expires 5m; + add_header Cache-Control "public, no-transform"; + } + + # Security headers + add_header 
X-Frame-Options "SAMEORIGIN" always; + add_header X-Content-Type-Options "nosniff" always; + + # Gzip + gzip on; + gzip_types text/html text/css application/javascript; +} diff --git a/notify.py b/notify.py new file mode 100644 index 0000000..a31a511 --- /dev/null +++ b/notify.py @@ -0,0 +1,178 @@ +from dataclasses import dataclass +from typing import Optional +import json + +from db import StoredJob +from scrapers.base import Job + + +@dataclass +class ChangeReport: + """Report of changes detected during a scrape.""" + company_name: str + new_jobs: list[Job] + removed_jobs: list[StoredJob] + total_active: int + + +class Notifier: + """Handles notifications for job changes.""" + + def __init__(self, config: dict): + self.config = config + + def notify(self, reports: list[ChangeReport]): + """Send notifications for all changes.""" + # Filter to only reports with changes + reports_with_changes = [r for r in reports if r.new_jobs or r.removed_jobs] + + if not reports_with_changes: + print("\n✓ No changes detected across all companies.") + return + + # Console output (always) + self._notify_console(reports_with_changes) + + # Email (if configured) + email_config = self.config.get("email") + if email_config: + self._notify_email(reports_with_changes, email_config) + + # Slack (if configured) + slack_config = self.config.get("slack") + if slack_config: + self._notify_slack(reports_with_changes, slack_config) + + def _notify_console(self, reports: list[ChangeReport]): + """Print changes to console.""" + print("\n" + "=" * 60) + print("JOB CHANGES DETECTED") + print("=" * 60) + + total_new = sum(len(r.new_jobs) for r in reports) + total_removed = sum(len(r.removed_jobs) for r in reports) + + print(f"\nSummary: {total_new} new jobs, {total_removed} removed jobs\n") + + for report in reports: + print(f"\n📌 {report.company_name} ({report.total_active} active jobs)") + print("-" * 40) + + if report.new_jobs: + print(f"\n 🆕 NEW JOBS ({len(report.new_jobs)}):") + for job in report.new_jobs: + location_str = f" [{job.location}]" if job.location else "" + remote_str = f" 🏠" if job.remote_type == "remote" else "" + print(f" • {job.title}{location_str}{remote_str}") + print(f" {job.url}") + + if report.removed_jobs: + print(f"\n ❌ REMOVED JOBS ({len(report.removed_jobs)}):") + for job in report.removed_jobs: + print(f" • {job.title}") + + print("\n" + "=" * 60) + + def _notify_email(self, reports: list[ChangeReport], config: dict): + """Send email notification.""" + import smtplib + from email.mime.text import MIMEText + from email.mime.multipart import MIMEMultipart + + # Build email body + body = self._build_html_report(reports) + + msg = MIMEMultipart("alternative") + msg["Subject"] = f"Job Alert: {sum(len(r.new_jobs) for r in reports)} new positions" + msg["From"] = config["from_addr"] + msg["To"] = config["to_addr"] + + msg.attach(MIMEText(body, "html")) + + try: + with smtplib.SMTP(config["smtp_host"], config["smtp_port"]) as server: + server.starttls() + server.login(config["username"], config["password"]) + server.send_message(msg) + print("✓ Email notification sent") + except Exception as e: + print(f"✗ Failed to send email: {e}") + + def _notify_slack(self, reports: list[ChangeReport], config: dict): + """Send Slack notification.""" + import httpx + + blocks = [] + + # Header + total_new = sum(len(r.new_jobs) for r in reports) + blocks.append({ + "type": "header", + "text": {"type": "plain_text", "text": f"🔔 {total_new} New Job Openings"} + }) + + for report in reports: + if report.new_jobs: + 
blocks.append({"type": "divider"}) + blocks.append({ + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"*{report.company_name}* ({len(report.new_jobs)} new)" + } + }) + + for job in report.new_jobs[:5]: # Limit to 5 per company + location = f" • {job.location}" if job.location else "" + blocks.append({ + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"<{job.url}|{job.title}>{location}" + } + }) + + payload = {"blocks": blocks} + + try: + response = httpx.post(config["webhook_url"], json=payload) + response.raise_for_status() + print("✓ Slack notification sent") + except Exception as e: + print(f"✗ Failed to send Slack notification: {e}") + + def _build_html_report(self, reports: list[ChangeReport]) -> str: + """Build HTML email body.""" + total_new = sum(len(r.new_jobs) for r in reports) + + html = f""" + + +

🔔 {total_new} New Job Openings

+ """ + + for report in reports: + if report.new_jobs: + html += f""" +

+ {report.company_name} +

+ " + + html += """ + + + """ + return html diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5949ab3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +httpx>=0.27.0 +beautifulsoup4>=4.12.0 +lxml>=5.0.0 +pyyaml>=6.0 +schedule>=1.2.0 diff --git a/scrapers/__init__.py b/scrapers/__init__.py new file mode 100644 index 0000000..195c9f9 --- /dev/null +++ b/scrapers/__init__.py @@ -0,0 +1,6 @@ +from .base import BaseScraper, Job +from .greenhouse import GreenhouseScraper +from .lever import LeverScraper +from .ashby import AshbyScraper + +__all__ = ["BaseScraper", "Job", "GreenhouseScraper", "LeverScraper", "AshbyScraper"] diff --git a/scrapers/ashby.py b/scrapers/ashby.py new file mode 100644 index 0000000..b275e37 --- /dev/null +++ b/scrapers/ashby.py @@ -0,0 +1,51 @@ +from .base import BaseScraper, Job + + +class AshbyScraper(BaseScraper): + """ + Scraper for companies using Ashby. + Ashby provides a JSON API endpoint. + + Example: https://api.ashbyhq.com/posting-api/job-board/{company} + """ + + def __init__(self, company_name: str, ashby_company: str, **kwargs): + # Ashby API endpoint + jobs_url = f"https://api.ashbyhq.com/posting-api/job-board/{ashby_company}" + super().__init__(company_name, jobs_url, **kwargs) + self.ashby_company = ashby_company + + def scrape(self) -> list[Job]: + """Scrape jobs from Ashby API.""" + data = self.fetch_json() + jobs = [] + + for job_data in data.get("jobs", []): + job_id = job_data.get("id", "") + title = job_data.get("title", "") + job_url = job_data.get("jobUrl", "") + + # Location info + location = job_data.get("location", "") + department = job_data.get("department", "") + + # Employment type + employment_type = job_data.get("employmentType", "") + + # Check for remote + is_remote = job_data.get("isRemote", False) + if is_remote: + remote_type = "remote" + else: + remote_type = self.classify_remote(location) + + jobs.append(Job( + external_id=job_id, + title=title, + url=job_url, + location=location, + department=department, + remote_type=remote_type + )) + + return jobs diff --git a/scrapers/base.py b/scrapers/base.py new file mode 100644 index 0000000..c07c80d --- /dev/null +++ b/scrapers/base.py @@ -0,0 +1,76 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import datetime +from typing import Optional +import httpx + + +@dataclass +class Job: + """Represents a job listing.""" + external_id: str + title: str + url: str + location: Optional[str] = None + department: Optional[str] = None + remote_type: Optional[str] = None # 'remote', 'hybrid', 'onsite' + + def __hash__(self): + return hash(self.external_id) + + def __eq__(self, other): + if isinstance(other, Job): + return self.external_id == other.external_id + return False + + +class BaseScraper(ABC): + """Base class for all job scrapers.""" + + def __init__(self, company_name: str, jobs_url: str, timeout: int = 30): + self.company_name = company_name + self.jobs_url = jobs_url + self.timeout = timeout + self.client = httpx.Client( + timeout=timeout, + headers={ + "User-Agent": "JobScraper/1.0 (Personal job search tool)" + }, + follow_redirects=True + ) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.client.close() + + def fetch(self, url: Optional[str] = None) -> str: + """Fetch the content from a URL.""" + target_url = url or self.jobs_url + response = self.client.get(target_url) + response.raise_for_status() + return response.text + + def fetch_json(self, url: 
Optional[str] = None) -> dict: + """Fetch JSON from a URL.""" + target_url = url or self.jobs_url + response = self.client.get(target_url) + response.raise_for_status() + return response.json() + + @abstractmethod + def scrape(self) -> list[Job]: + """Scrape jobs from the company's career page. Must be implemented by subclasses.""" + pass + + def classify_remote(self, location: str) -> Optional[str]: + """Try to classify if a job is remote based on location text.""" + if not location: + return None + location_lower = location.lower() + if "remote" in location_lower: + if "hybrid" in location_lower: + return "hybrid" + return "remote" + return "onsite" diff --git a/scrapers/greenhouse.py b/scrapers/greenhouse.py new file mode 100644 index 0000000..6ccf606 --- /dev/null +++ b/scrapers/greenhouse.py @@ -0,0 +1,42 @@ +from .base import BaseScraper, Job + + +class GreenhouseScraper(BaseScraper): + """ + Scraper for companies using Greenhouse. + Greenhouse provides a JSON API at /embed/job_board/jobs endpoint. + + Example: https://boards-api.greenhouse.io/v1/boards/{company}/jobs + """ + + def __init__(self, company_name: str, board_token: str, **kwargs): + # Greenhouse API endpoint + jobs_url = f"https://boards-api.greenhouse.io/v1/boards/{board_token}/jobs" + super().__init__(company_name, jobs_url, **kwargs) + self.board_token = board_token + + def scrape(self) -> list[Job]: + """Scrape jobs from Greenhouse API.""" + data = self.fetch_json() + jobs = [] + + for job_data in data.get("jobs", []): + job_id = str(job_data.get("id", "")) + title = job_data.get("title", "") + location = job_data.get("location", {}).get("name", "") + absolute_url = job_data.get("absolute_url", "") + + # Get department if available + departments = job_data.get("departments", []) + department = departments[0].get("name") if departments else None + + jobs.append(Job( + external_id=job_id, + title=title, + url=absolute_url, + location=location, + department=department, + remote_type=self.classify_remote(location) + )) + + return jobs diff --git a/scrapers/lever.py b/scrapers/lever.py new file mode 100644 index 0000000..f93a945 --- /dev/null +++ b/scrapers/lever.py @@ -0,0 +1,50 @@ +from .base import BaseScraper, Job + + +class LeverScraper(BaseScraper): + """ + Scraper for companies using Lever. + Lever provides a JSON API at /v0/postings/{company} endpoint. + + Example: https://api.lever.co/v0/postings/{company} + """ + + def __init__(self, company_name: str, lever_company: str, **kwargs): + # Lever API endpoint + jobs_url = f"https://api.lever.co/v0/postings/{lever_company}" + super().__init__(company_name, jobs_url, **kwargs) + self.lever_company = lever_company + + def scrape(self) -> list[Job]: + """Scrape jobs from Lever API.""" + data = self.fetch_json() + jobs = [] + + for job_data in data: + job_id = job_data.get("id", "") + title = job_data.get("text", "") + hosted_url = job_data.get("hostedUrl", "") + + # Location info + categories = job_data.get("categories", {}) + location = categories.get("location", "") + department = categories.get("department", "") + commitment = categories.get("commitment", "") # Full-time, Part-time, etc. + + # Check for remote in work type + work_type = categories.get("workplaceType", "") + if work_type: + remote_type = self.classify_remote(work_type) + else: + remote_type = self.classify_remote(location) + + jobs.append(Job( + external_id=job_id, + title=title, + url=hosted_url, + location=location, + department=department, + remote_type=remote_type + )) + + return jobs
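All three scrapers share the `BaseScraper` interface (context manager plus `scrape()` returning `Job` objects), so any one of them can be exercised on its own before wiring it into `main.py`. A minimal sketch, assuming the dependencies from `requirements.txt` are installed and reusing the `gitlab` board token from `config.yaml`:

```python
# Quick standalone check of a single scraper, run from the project root.
from scrapers import GreenhouseScraper

if __name__ == "__main__":
    # "GitLab" / "gitlab" mirror the name and board_token entries in config.yaml.
    with GreenhouseScraper("GitLab", "gitlab") as scraper:
        jobs = scraper.scrape()
        print(f"{len(jobs)} openings")
        for job in jobs[:5]:
            print(f"- {job.title} [{job.location or 'n/a'}] {job.url}")
```

The same pattern applies to `LeverScraper(name, lever_company)` and `AshbyScraper(name, ashby_company)`.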