Refined

This commit is contained in:
parent 185b5ce2f1
commit fd4254df3e

9 changed files with 1332 additions and 4176 deletions
CLAUDE.md (new file, 65 lines)
@@ -0,0 +1,65 @@
# Job Scraper

Job board monitoring for privacy-focused and open-source companies.

## Quick Reference

| Item | Value |
|------|-------|
| URL | https://jobs.novanexus.ca |
| Port | 8085 |
| Containers | job-scraper-scheduled, job-dashboard |
| Data | ./data/jobs.db, ./data/dashboard.html |

## Development Workflow

After making code changes, rebuild and deploy:

```bash
cd ~/job-scraper && sudo docker compose build && sudo docker compose up -d
```

## Common Commands

```bash
# View status
sudo docker compose ps

# View logs
sudo docker compose logs -f scraper-scheduled

# Run scraper manually (one-time)
sudo docker compose run --rm scraper

# Restart services
sudo docker compose restart

# Rebuild without cache (if having issues)
sudo docker compose build --no-cache && sudo docker compose up -d
```

## Configuration

- `config.yaml` - Companies to monitor, filters, notifications
- After config changes: `sudo docker compose restart scraper-scheduled`

## Email Notifications

Uses msmtp with system config (`~/.msmtprc`). The container mounts this file.

To test email manually:

```bash
sudo docker compose run --rm scraper
```

## Files

- `main.py` - CLI entry point, scheduling
- `db.py` - SQLite database operations
- `notify.py` - Console, msmtp, email, Slack notifications
- `dashboard.py` - HTML dashboard generator
- `scrapers/` - Platform-specific scrapers (greenhouse, lever, ashby)

## Documentation

Full docs: ~/maple-docs/docs/04-services/job-scraper.md
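Beyond the commands above, the scraped data can also be inspected directly with the helpers in `db.py`. A minimal sketch, assuming it runs from the repo root so the default `Database()` path resolves to `./data/jobs.db` (per the Quick Reference), or inside the container at `/app`:

```python
# Illustrative sketch: inspect scraped data with the project's own Database
# helpers. Assumes the default Database() path points at ./data/jobs.db,
# so run it from the repo root (or inside the container, where /app/data
# is the mounted ./data directory).
from db import Database

db = Database()

# Companies currently marked active
print(db.get_all_companies())

# Active job count per company, mirroring the counting done in dashboard.py
counts: dict[str, int] = {}
for company_name, job in db.get_all_active_jobs():
    counts[company_name] = counts.get(company_name, 0) + 1

for name, count in sorted(counts.items()):
    print(f"{name}: {count}")
```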
Dockerfile
@@ -2,6 +2,12 @@ FROM python:3.12-slim
 WORKDIR /app

+# Install msmtp for email notifications
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    msmtp \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
 # Install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
config.yaml
@@ -154,7 +154,12 @@ notifications:
   # Console output is always enabled
   console: true

-  # Uncomment and configure for email notifications
+  # msmtp - uses system msmtp config (~/.msmtprc)
+  msmtp:
+    from_addr: admin@novanexus.ca
+    to_addr: me@bastiangruber.ca
+
+  # Uncomment and configure for SMTP email notifications (alternative to msmtp)
   # email:
   #   smtp_host: smtp.gmail.com
   #   smtp_port: 587
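To sanity-check the edited notifications block, a short sketch (assuming PyYAML, which the scraper presumably already uses to parse config.yaml) that reports which channels are configured:

```python
# Sketch only: show which notification channels are configured after editing
# config.yaml. Assumes PyYAML is installed in the environment.
import yaml

with open("config.yaml") as f:
    config = yaml.safe_load(f)

notifications = config.get("notifications") or {}
for channel in ("console", "msmtp", "email", "slack"):
    value = notifications.get(channel)
    print(f"{channel}: {'configured' if value else 'not configured'}")
```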
dashboard.py (36 changed lines)
@@ -133,6 +133,14 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
     db = Database()
     jobs = db.get_all_active_jobs()

+    # Get all monitored companies
+    all_company_names = db.get_all_companies()
+
+    # Track total jobs per company (before location filtering)
+    total_per_company = {}
+    for company_name, job in jobs:
+        total_per_company[company_name] = total_per_company.get(company_name, 0) + 1
+
     # Group by company, filtering out irrelevant remote locations
     companies = {}
     filtered_count = 0

@@ -144,7 +152,15 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
             companies[company_name] = []
         companies[company_name].append(job)

+    # Ensure all monitored companies are in the dict (even with 0 jobs)
+    for name in all_company_names:
+        if name not in companies:
+            companies[name] = []
+        if name not in total_per_company:
+            total_per_company[name] = 0
+
     total_shown = sum(len(jobs) for jobs in companies.values())
+    total_scraped = sum(total_per_company.values())

     # Sort companies by name
     sorted_companies = sorted(companies.items())

@@ -334,6 +350,13 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
         .toc-links a:hover {{
             text-decoration: underline;
         }}
+        .toc-links .empty {{
+            color: var(--muted);
+            cursor: default;
+        }}
+        .toc-links .empty:hover {{
+            text-decoration: none;
+        }}
         .filter-buttons {{
             display: flex;
             flex-wrap: wrap;

@@ -367,7 +390,7 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
     <h1>$ job-board</h1>
     <div class="meta">
         Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} |
-        {total_shown} jobs across {len(companies)} companies
+        {total_shown}/{total_scraped} jobs (location filtered) | Monitoring {len(all_company_names)} companies
     </div>
 </header>

@@ -402,7 +425,12 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
     # Table of contents
     for company_name, company_jobs in sorted_companies:
         anchor = company_name.lower().replace(" ", "-")
-        html += f' <a href="#{anchor}">{company_name} ({len(company_jobs)})</a>\n'
+        filtered = len(company_jobs)
+        total = total_per_company.get(company_name, 0)
+        if filtered > 0:
+            html += f' <a href="#{anchor}">{company_name} ({filtered}/{total})</a>\n'
+        else:
+            html += f' <span class="empty">{company_name} (0/{total})</span>\n'

     html += """ </div>
     </div>

@@ -410,8 +438,10 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
     <main id="job-list">
 """

-    # Job listings
+    # Job listings (only for companies with jobs)
     for company_name, company_jobs in sorted_companies:
+        if not company_jobs:
+            continue  # Skip companies with no jobs after filtering
         anchor = company_name.lower().replace(" ", "-")
         html += f"""
     <div class="company" id="{anchor}">
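With these changes the table of contents shows filtered/total counts and keeps companies whose jobs were all filtered out. A toy illustration of that counting, using made-up company names rather than anything from the real config:

```python
# Toy illustration of the filtered/total counts now shown in the TOC.
# Company names and jobs are hypothetical; "Globex" stands in for a monitored
# company whose only scraped job was dropped by the location filter.
scraped = [("Acme", "Backend Engineer"), ("Acme", "SRE"), ("Globex", "Field Tech (Onsite)")]
all_company_names = ["Acme", "Globex", "Initech"]  # Initech monitored, nothing scraped

total_per_company = {}
for company, _title in scraped:
    total_per_company[company] = total_per_company.get(company, 0) + 1

# Pretend the location filter dropped the Globex job
companies = {"Acme": ["Backend Engineer", "SRE"]}
for name in all_company_names:
    companies.setdefault(name, [])
    total_per_company.setdefault(name, 0)

for name in sorted(companies):
    filtered, total = len(companies[name]), total_per_company[name]
    # In the dashboard, filtered == 0 renders as a plain span instead of a link
    print(f"{name} ({filtered}/{total})")
# Acme (2/2), Globex (0/1), Initech (0/0)
```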
data/dashboard.html (5297 changed lines)
File diff suppressed because it is too large

Binary file not shown.
db.py (13 changed lines)
@@ -35,8 +35,11 @@ class Database:
     @contextmanager
     def _get_conn(self):
         """Get a database connection."""
-        conn = sqlite3.connect(self.db_path)
+        conn = sqlite3.connect(self.db_path, timeout=30.0)
         conn.row_factory = sqlite3.Row
+        # Enable WAL mode for better concurrency
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.execute("PRAGMA busy_timeout=30000")
         try:
             yield conn
             conn.commit()

@@ -236,3 +239,11 @@ class Database:
                 )
                 results.append((row["company_name"], job))
             return results
+
+    def get_all_companies(self) -> list[str]:
+        """Get all company names from the database."""
+        with self._get_conn() as conn:
+            cursor = conn.execute(
+                "SELECT name FROM companies WHERE active = TRUE ORDER BY name"
+            )
+            return [row["name"] for row in cursor.fetchall()]
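The WAL and busy_timeout settings matter because the scheduled scraper and the dashboard container share ./data/jobs.db. A small standalone sketch of the behaviour they enable, using a throwaway database rather than the real one:

```python
# Standalone sketch (throwaway temp database, not data/jobs.db): with WAL
# enabled, a reader sees the last committed snapshot even while another
# connection holds an open write transaction, instead of failing with
# "database is locked".
import os
import sqlite3
import tempfile

path = os.path.join(tempfile.mkdtemp(), "wal_demo.db")

# isolation_level=None disables implicit transactions so BEGIN is explicit
writer = sqlite3.connect(path, timeout=30.0, isolation_level=None)
writer.execute("PRAGMA journal_mode=WAL")
writer.execute("CREATE TABLE jobs (id INTEGER PRIMARY KEY, title TEXT)")
writer.execute("INSERT INTO jobs (title) VALUES ('First job')")

# Leave a write transaction open
writer.execute("BEGIN IMMEDIATE")
writer.execute("INSERT INTO jobs (title) VALUES ('Uncommitted job')")

# A second connection (like the dashboard container) can still read
reader = sqlite3.connect(path, timeout=30.0)
reader.execute("PRAGMA busy_timeout=30000")
count = reader.execute("SELECT COUNT(*) FROM jobs").fetchone()[0]
print(count)  # 1 - only the committed row is visible

writer.execute("ROLLBACK")
writer.close()
reader.close()
```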
docker-compose.yml
@@ -6,6 +6,7 @@ services:
     volumes:
       - ./data:/app/data
       - ./config.yaml:/app/config.yaml:ro
+      - /home/gruberb/.msmtprc:/root/.msmtprc:ro
     environment:
       - TZ=America/Toronto

@@ -16,6 +17,7 @@ services:
     volumes:
       - ./data:/app/data
       - ./config.yaml:/app/config.yaml:ro
+      - /home/gruberb/.msmtprc:/root/.msmtprc:ro
     environment:
       - TZ=America/Toronto
     command: ["python", "main.py", "--schedule"]
notify.py (82 changed lines)
@@ -38,6 +38,11 @@ class Notifier:
         if email_config:
             self._notify_email(reports_with_changes, email_config)

+        # msmtp (if configured - uses system msmtp config)
+        msmtp_config = self.config.get("msmtp")
+        if msmtp_config:
+            self._notify_msmtp(reports_with_changes, msmtp_config)
+
         # Slack (if configured)
         slack_config = self.config.get("slack")
         if slack_config:

@@ -98,6 +103,83 @@
         except Exception as e:
             print(f"✗ Failed to send email: {e}")

+    def _notify_msmtp(self, reports: list[ChangeReport], config: dict):
+        """Send email notification via system msmtp."""
+        import subprocess
+        from datetime import datetime
+
+        to_addr = config.get("to_addr", "me@bastiangruber.ca")
+        from_addr = config.get("from_addr", "admin@novanexus.ca")
+
+        total_new = sum(len(r.new_jobs) for r in reports)
+        total_removed = sum(len(r.removed_jobs) for r in reports)
+
+        # Build subject line
+        parts = []
+        if total_new:
+            parts.append(f"+{total_new} new")
+        if total_removed:
+            parts.append(f"-{total_removed} removed")
+        subject = f"Job Board Update: {', '.join(parts)}"
+
+        # Build plain text body
+        body_lines = [
+            "JOB BOARD CHANGES",
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M')}",
+            "",
+            f"Summary: {total_new} new jobs, {total_removed} removed jobs",
+            "",
+        ]
+
+        for report in reports:
+            body_lines.append(f"{report.company_name} ({report.total_active} active)")
+            body_lines.append("-" * 40)
+
+            if report.new_jobs:
+                body_lines.append(f" NEW ({len(report.new_jobs)}):")
+                for job in report.new_jobs:
+                    location_str = f" [{job.location}]" if job.location else ""
+                    remote_str = " (Remote)" if job.remote_type == "remote" else ""
+                    body_lines.append(f" + {job.title}{location_str}{remote_str}")
+                    body_lines.append(f" {job.url}")
+
+            if report.removed_jobs:
+                body_lines.append(f" REMOVED ({len(report.removed_jobs)}):")
+                for job in report.removed_jobs:
+                    body_lines.append(f" - {job.title}")
+
+            body_lines.append("")
+
+        body_lines.append("---")
+        body_lines.append("Generated by job-scraper")
+
+        body = "\n".join(body_lines)
+
+        # Build email message
+        email_msg = f"""Subject: {subject}
+From: {from_addr}
+To: {to_addr}
+Content-Type: text/plain; charset=UTF-8
+
+{body}
+"""
+
+        try:
+            result = subprocess.run(
+                ["msmtp", to_addr],
+                input=email_msg,
+                capture_output=True,
+                text=True,
+            )
+            if result.returncode == 0:
+                print("✓ msmtp notification sent")
+            else:
+                print(f"✗ msmtp failed: {result.stderr}")
+        except FileNotFoundError:
+            print("✗ msmtp not found - install with: apt install msmtp")
+        except Exception as e:
+            print(f"✗ Failed to send msmtp notification: {e}")
+
     def _notify_slack(self, reports: list[ChangeReport], config: dict):
         """Send Slack notification."""
         import httpx
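A minimal standalone sketch for exercising the same msmtp path without waiting for a scheduled run; the recipient address is a placeholder, and msmtp must already be configured via ~/.msmtprc:

```python
# Sketch only: pipe a hand-built test message to msmtp the same way
# _notify_msmtp does. The recipient is a placeholder address.
import subprocess

to_addr = "you@example.com"  # placeholder - put your own address here
test_msg = (
    "Subject: job-scraper msmtp test\n"
    f"To: {to_addr}\n"
    "Content-Type: text/plain; charset=UTF-8\n"
    "\n"
    "If this arrives, msmtp delivery from the scraper environment works.\n"
)

result = subprocess.run(
    ["msmtp", to_addr],
    input=test_msg,
    capture_output=True,
    text=True,
)
print("✓ sent" if result.returncode == 0 else f"✗ failed: {result.stderr}")
```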