Update filters and add cleanup

This commit is contained in:
Bastian Gruber 2026-01-29 16:24:44 +00:00
parent fd4254df3e
commit c655f2e078
Signed by: gruberb
GPG key ID: 426AF1CBA0530691
7 changed files with 4694 additions and 1242 deletions

View file

@ -72,10 +72,6 @@ companies:
platform: greenhouse
board_token: automatticcareers
- name: Canonical
platform: greenhouse
board_token: canonical
- name: ClickHouse
platform: greenhouse
board_token: clickhouse

View file

@ -3,168 +3,193 @@
Generate a simple text-based HTML dashboard of all tracked jobs.
"""
import re
from datetime import datetime
from pathlib import Path
from collections import Counter
from db import Database
# Regions/locations we care about (case-insensitive matching)
DESIRED_REGIONS = [
"canada", "toronto", "vancouver",
"germany", "berlin", "munich",
"emea",
"americas", # includes North/South America
"north america",
"worldwide", "global", "anywhere",
]
# Locations to explicitly exclude (on-site or remote restricted to these)
EXCLUDED_LOCATIONS = [
# US cities/states (we don't want US-only jobs)
"san francisco", "new york", "nyc", "seattle", "austin", "boston",
"chicago", "denver", "los angeles", "atlanta", "dallas", "houston",
"california", "washington", "texas", "massachusetts", "colorado",
"united states", "usa", "u.s.", "us-", "usa-",
# Location grouping rules: keyword -> (group_id, display_name)
# Order matters - first match wins
LOCATION_RULES = [
# Canada
(["canada", "toronto", "vancouver", "montreal", "ottawa", "calgary", "waterloo"], "canada", "Canada"),
# Germany
(["germany", "berlin", "munich", "frankfurt", "hamburg"], "germany", "Germany"),
# UK
"london", "united kingdom", "uk", "dublin", "ireland",
# Australia/APAC (not EMEA)
"sydney", "melbourne", "australia", "singapore", "tokyo", "japan",
"india", "bangalore", "bengaluru", "hyderabad", "delhi",
"korea", "seoul", "taiwan", "taipei", "china", "beijing", "shenzhen",
# Other excluded
"israel", "tel aviv", "brazil", "sao paulo", "mexico",
"netherlands", "amsterdam", "france", "paris", "spain", "madrid",
"portugal", "lisbon", "poland", "warsaw", "italy",
"czech", "prague", "serbia", "belgrade", "cyprus", "limassol",
"austria", "vienna", "sweden", "stockholm", "denmark", "copenhagen",
"switzerland", "romania", "bucharest", "hungary", "greece",
"south africa", "indonesia", "jakarta", "malaysia",
(["united kingdom", " uk", "uk ", "london", "england", "manchester", "edinburgh"], "uk", "UK"),
# Ireland
(["ireland", "dublin"], "ireland", "Ireland"),
# Netherlands
(["netherlands", "amsterdam", "rotterdam"], "netherlands", "Netherlands"),
# France
(["france", "paris"], "france", "France"),
# Spain
(["spain", "madrid", "barcelona"], "spain", "Spain"),
# Poland
(["poland", "warsaw", "krakow", "wroclaw"], "poland", "Poland"),
# Sweden
(["sweden", "stockholm"], "sweden", "Sweden"),
# Switzerland
(["switzerland", "zurich", "geneva"], "switzerland", "Switzerland"),
# Australia
(["australia", "sydney", "melbourne"], "australia", "Australia"),
# India
(["india", "bangalore", "bengaluru", "hyderabad", "delhi", "mumbai", "pune"], "india", "India"),
# Japan
(["japan", "tokyo"], "japan", "Japan"),
# Singapore
(["singapore"], "singapore", "Singapore"),
# Israel
(["israel", "tel aviv"], "israel", "Israel"),
# Brazil
(["brazil", "sao paulo"], "brazil", "Brazil"),
# US (must be after other countries to avoid false matches)
(["united states", "usa", "u.s.", "san francisco", "new york", "nyc", "seattle",
"austin", "boston", "chicago", "denver", "los angeles", "atlanta", "dallas",
"houston", "california", "washington", "texas", "massachusetts", "colorado",
"portland", "miami", "phoenix", "san diego", "san jose", "palo alto",
"mountain view", "sunnyvale", "menlo park", "cupertino"], "us", "US"),
# Regions
(["emea"], "emea", "EMEA"),
(["americas", "north america", "latam"], "americas", "Americas"),
(["apac", "asia pacific", "asia-pacific"], "apac", "APAC"),
(["worldwide", "global", "anywhere", "earth"], "worldwide", "Worldwide"),
]
def is_location_relevant(location: str, remote_type: str) -> bool:
def extract_location_info(location: str, remote_type: str) -> tuple[list[str], str]:
"""
Strict location filter. Only keeps jobs available in Canada, Germany, EMEA, or Worldwide.
Filters out US-only jobs, UK jobs, APAC jobs, etc.
Extract location tags and short display text from a job's location.
Returns (list of tag ids, short display location)
"""
if not location:
return False # No location info = probably US-based, filter out
loc_lower = location.lower()
# Check if any desired region is mentioned FIRST
has_desired = any(region in loc_lower for region in DESIRED_REGIONS)
# If it has a desired region, keep it (even if it also mentions excluded locations)
# e.g., "Remote (United States | Canada)" should be kept because of Canada
if has_desired:
return True
# If it just says "Remote" with nothing else, keep it (truly remote)
if loc_lower.strip() == "remote":
return True
# Check for excluded locations
has_excluded = any(excl in loc_lower for excl in EXCLUDED_LOCATIONS)
if has_excluded:
return False
# Check for patterns like "In-Office", "Hybrid", "On-site" without desired region
if any(x in loc_lower for x in ["in-office", "hybrid", "on-site", "onsite", "office based"]):
return False
# If we can't determine, filter it out (safer)
return False
def extract_location_tags(location: str, remote_type: str) -> tuple[list[str], str]:
"""
Extract relevant location tags and a short display location.
Returns (list of tag names, short location string)
"""
if not location:
return [], ""
loc_lower = location.lower()
tags = []
short_loc = ""
display = ""
if not location:
return tags, display
loc_lower = location.lower()
# Check for remote
is_remote = remote_type == "remote" or "remote" in loc_lower
if is_remote:
tags.append("remote")
# Check for Canada
if any(x in loc_lower for x in ["canada", "toronto", "vancouver"]):
tags.append("canada")
short_loc = "Canada"
# Check against location rules
for keywords, tag_id, display_name in LOCATION_RULES:
if any(kw in loc_lower for kw in keywords):
if tag_id not in tags:
tags.append(tag_id)
if not display:
display = display_name
# Check for Germany/Berlin
if any(x in loc_lower for x in ["germany", "berlin", "munich"]):
tags.append("germany")
short_loc = "Germany" if "germany" in loc_lower else "Berlin"
# Fallback display
if not display:
if is_remote:
display = "Remote"
elif location:
display = location[:25] + "..." if len(location) > 25 else location
# Check for EMEA
if "emea" in loc_lower:
tags.append("emea")
short_loc = "EMEA"
# Check for Americas/North America
if "americas" in loc_lower or "north america" in loc_lower:
tags.append("americas")
short_loc = "Americas"
# Check for Worldwide
if any(x in loc_lower for x in ["worldwide", "global", "anywhere"]):
tags.append("worldwide")
short_loc = "Worldwide"
# If no specific region found but it's remote
if not short_loc and is_remote:
short_loc = "Remote"
return tags, short_loc
return tags, display
def generate_dashboard(output_path: str = "data/dashboard.html"):
"""Generate a static HTML dashboard."""
db = Database()
jobs = db.get_all_active_jobs()
# Get all monitored companies
all_company_names = db.get_all_companies()
# Track total jobs per company (before location filtering)
total_per_company = {}
for company_name, job in jobs:
total_per_company[company_name] = total_per_company.get(company_name, 0) + 1
# Group by company, filtering out irrelevant remote locations
# Process all jobs and collect location data
companies = {}
filtered_count = 0
location_counts = Counter()
for company_name, job in jobs:
if not is_location_relevant(job.location, job.remote_type):
filtered_count += 1
continue
# Extract location info
tags, display = extract_location_info(job.location, job.remote_type)
# Count locations for filter generation
for tag in tags:
location_counts[tag] += 1
# Store processed job data
if company_name not in companies:
companies[company_name] = []
companies[company_name].append(job)
# Ensure all monitored companies are in the dict (even with 0 jobs)
companies[company_name].append({
"job": job,
"tags": tags,
"display": display,
"search_text": f"{job.title.lower()} {(job.location or '').lower()} {(job.department or '').lower()} {' '.join(tags)}"
})
# Ensure all companies exist (even with 0 jobs)
for name in all_company_names:
if name not in companies:
companies[name] = []
if name not in total_per_company:
total_per_company[name] = 0
total_shown = sum(len(jobs) for jobs in companies.values())
total_scraped = sum(total_per_company.values())
# Sort companies by name
total_jobs = sum(len(j) for j in companies.values())
sorted_companies = sorted(companies.items())
# Generate dynamic location filters (only show locations that exist in data)
# Order: Remote first, then by count descending
location_filters = []
if "remote" in location_counts:
location_filters.append(("remote", "Remote", location_counts["remote"]))
# Add other locations sorted by count
other_locations = [(tag, count) for tag, count in location_counts.items() if tag != "remote"]
other_locations.sort(key=lambda x: -x[1])
# Map tag_id to display name
tag_display = {tag_id: display for keywords, tag_id, display in LOCATION_RULES}
tag_display["remote"] = "Remote"
for tag_id, count in other_locations:
display = tag_display.get(tag_id, tag_id.title())
location_filters.append((tag_id, display, count))
# Generate location filter buttons HTML
location_buttons = ""
for tag_id, display, count in location_filters:
location_buttons += f' <button class="filter-btn" data-filter="{tag_id}" data-category="location">{display} ({count})</button>\n'
# Generate tag colors dynamically
tag_colors = {
"remote": ("#1a4a1a", "#4ade80"),
"canada": ("#4a1a1a", "#f87171"),
"germany": ("#4a4a1a", "#facc15"),
"uk": ("#2a1a3a", "#a78bfa"),
"us": ("#3a2a1a", "#fb923c"),
"emea": ("#1a3a4a", "#60a5fa"),
"americas": ("#3a1a4a", "#c084fc"),
"worldwide": ("#1a4a3a", "#34d399"),
"apac": ("#1a2a4a", "#38bdf8"),
"ireland": ("#1a4a2a", "#4ade80"),
"netherlands": ("#3a3a1a", "#fbbf24"),
"france": ("#2a2a4a", "#818cf8"),
"spain": ("#4a2a1a", "#fb7185"),
"poland": ("#3a1a2a", "#f472b6"),
"sweden": ("#1a3a3a", "#2dd4bf"),
"switzerland": ("#4a1a2a", "#fb7185"),
"australia": ("#2a3a1a", "#a3e635"),
"india": ("#4a3a1a", "#fcd34d"),
"japan": ("#4a1a3a", "#e879f9"),
"singapore": ("#1a4a4a", "#22d3d1"),
"israel": ("#3a2a2a", "#fca5a5"),
"brazil": ("#2a4a1a", "#86efac"),
}
# Generate CSS for tags
tag_css = ""
for tag_id, (bg, fg) in tag_colors.items():
tag_css += f""" .tag-{tag_id} {{
background: {bg};
color: {fg};
}}
"""
html = f"""<!DOCTYPE html>
<html lang="en">
<head>
@ -299,30 +324,7 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
font-size: 11px;
margin-left: 5px;
}}
.tag-remote {{
background: #1a4a1a;
color: #4ade80;
}}
.tag-canada {{
background: #4a1a1a;
color: #f87171;
}}
.tag-berlin {{
background: #4a4a1a;
color: #facc15;
}}
.tag-emea {{
background: #1a3a4a;
color: #60a5fa;
}}
.tag-americas {{
background: #3a1a4a;
color: #c084fc;
}}
.tag-worldwide {{
background: #1a4a3a;
color: #34d399;
}}
{tag_css}
.hidden {{
display: none;
}}
@ -342,26 +344,32 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
flex-wrap: wrap;
gap: 10px;
}}
.toc-links a {{
.toc-link {{
color: var(--accent);
text-decoration: none;
font-size: 13px;
}}
.toc-links a:hover {{
.toc-link:hover {{
text-decoration: underline;
}}
.toc-links .empty {{
.toc-link.empty {{
color: var(--muted);
cursor: default;
}}
.toc-links .empty:hover {{
text-decoration: none;
.toc-link.hidden {{
display: none;
}}
.filter-buttons {{
.filter-section {{
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 10px;
align-items: center;
}}
.filter-label {{
color: var(--muted);
font-size: 12px;
margin-right: 4px;
min-width: 60px;
}}
.filter-btn {{
background: var(--bg);
@ -383,6 +391,13 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
border-color: var(--accent);
color: var(--bg);
}}
.clear-btn {{
border-color: #666;
}}
.clear-btn:hover {{
border-color: #f87171;
color: #f87171;
}}
</style>
</head>
<body>
@ -390,47 +405,47 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
<h1>$ job-board</h1>
<div class="meta">
Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} |
{total_shown}/{total_scraped} jobs (location filtered) | Monitoring {len(all_company_names)} companies
{total_jobs} jobs | {len(all_company_names)} companies
</div>
</header>
<div class="filters">
<input type="text" id="search" placeholder="Filter jobs... (e.g. 'senior engineer', 'remote', 'canada')" autofocus>
<div class="filter-buttons">
<button class="filter-btn" data-filter="">All</button>
<button class="filter-btn" data-filter="engineer">Engineering</button>
<button class="filter-btn" data-filter="senior engineer">Senior Eng</button>
<button class="filter-btn" data-filter="staff principal">Staff+</button>
<button class="filter-btn" data-filter="manager director">Management</button>
<button class="filter-btn" data-filter="product">Product</button>
<button class="filter-btn" data-filter="design">Design</button>
<button class="filter-btn" data-filter="security">Security</button>
<button class="filter-btn" data-filter="remote">Remote</button>
<button class="filter-btn" data-filter="canada">Canada</button>
<button class="filter-btn" data-filter="germany">Germany</button>
<button class="filter-btn" data-filter="emea">EMEA</button>
<button class="filter-btn" data-filter="americas">Americas</button>
<button class="filter-btn" data-filter="worldwide">Worldwide</button>
<input type="text" id="search" placeholder="Filter jobs... (press / to focus, Esc to clear)" autofocus>
<div class="filter-section">
<span class="filter-label">Quick:</span>
<button class="filter-btn" data-filter="" data-category="all">All ({total_jobs})</button>
<button class="filter-btn clear-btn" data-action="clear">Clear Filters</button>
</div>
<div class="filter-section">
<span class="filter-label">Location:</span>
{location_buttons} </div>
<div class="filter-section">
<span class="filter-label">Role:</span>
<button class="filter-btn" data-filter="engineer" data-category="role">Engineering</button>
<button class="filter-btn" data-filter="senior" data-category="role">Senior</button>
<button class="filter-btn" data-filter="staff principal" data-category="role">Staff+</button>
<button class="filter-btn" data-filter="backend" data-category="role">Backend</button>
<button class="filter-btn" data-filter="frontend" data-category="role">Frontend</button>
<button class="filter-btn" data-filter="infrastructure platform sre" data-category="role">Infra/Platform</button>
<button class="filter-btn" data-filter="security" data-category="role">Security</button>
<button class="filter-btn" data-filter="manager director" data-category="role">Management</button>
</div>
<div class="stats">
<span id="visible-count">{total_shown} jobs shown</span>
<span id="visible-count">{total_jobs} jobs shown</span>
</div>
</div>
<div class="toc">
<div class="toc-title">Jump to company:</div>
<div class="toc-links">
<div class="toc-links" id="toc-links">
"""
# Table of contents
# Table of contents with data attributes for JS updating
for company_name, company_jobs in sorted_companies:
anchor = company_name.lower().replace(" ", "-")
filtered = len(company_jobs)
total = total_per_company.get(company_name, 0)
if filtered > 0:
html += f' <a href="#{anchor}">{company_name} ({filtered}/{total})</a>\n'
else:
html += f' <span class="empty">{company_name} (0/{total})</span>\n'
anchor = company_name.lower().replace(" ", "-").replace("'", "")
count = len(company_jobs)
css_class = "toc-link" if count > 0 else "toc-link empty"
html += f' <a href="#{anchor}" class="{css_class}" data-company="{anchor}" data-total="{count}">{company_name} ({count})</a>\n'
html += """ </div>
</div>
@ -438,44 +453,34 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
<main id="job-list">
"""
# Job listings (only for companies with jobs)
# Job listings
for company_name, company_jobs in sorted_companies:
if not company_jobs:
continue # Skip companies with no jobs after filtering
anchor = company_name.lower().replace(" ", "-")
continue
anchor = company_name.lower().replace(" ", "-").replace("'", "")
total = len(company_jobs)
html += f"""
<div class="company" id="{anchor}">
<div class="company" id="{anchor}" data-company="{anchor}" data-total="{total}">
<div class="company-header">
<span class="company-name">{company_name}</span>
<span class="company-count">{len(company_jobs)} positions</span>
<span class="company-count" data-total="{total}">{total} positions</span>
</div>
<div class="jobs">
"""
for job in sorted(company_jobs, key=lambda j: j.title):
location = job.location or ""
location_lower = location.lower()
# Extract tags and short location
tag_list, short_loc = extract_location_tags(location, job.remote_type)
for job_data in sorted(company_jobs, key=lambda j: j["job"].title):
job = job_data["job"]
tags = job_data["tags"]
display = job_data["display"]
search_text = job_data["search_text"]
# Build tag HTML
tags = ""
if "remote" in tag_list:
tags += '<span class="tag tag-remote">remote</span>'
if "canada" in tag_list:
tags += '<span class="tag tag-canada">canada</span>'
if "germany" in tag_list:
tags += '<span class="tag tag-berlin">germany</span>'
if "emea" in tag_list:
tags += '<span class="tag tag-emea">emea</span>'
if "americas" in tag_list:
tags += '<span class="tag tag-americas">americas</span>'
if "worldwide" in tag_list:
tags += '<span class="tag tag-worldwide">worldwide</span>'
tag_html = ""
for tag in tags:
tag_html += f'<span class="tag tag-{tag}">{tag}</span>'
html += f""" <div class="job" data-search="{job.title.lower()} {location_lower} {(job.department or '').lower()} {' '.join(tag_list)}">
<span class="job-title"><a href="{job.url}" target="_blank">{job.title}</a>{tags}</span>
<span class="job-location">{short_loc}</span>
html += f""" <div class="job" data-search="{search_text}">
<span class="job-title"><a href="{job.url}" target="_blank">{job.title}</a>{tag_html}</span>
<span class="job-location">{display}</span>
</div>
"""
html += """ </div>
@ -488,67 +493,155 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
const search = document.getElementById('search');
const jobs = document.querySelectorAll('.job');
const companies = document.querySelectorAll('.company');
const tocLinks = document.querySelectorAll('.toc-link');
const visibleCount = document.getElementById('visible-count');
const filterBtns = document.querySelectorAll('.filter-btn');
const clearBtn = document.querySelector('.clear-btn');
function filterJobs(query) {
let visible = 0;
const terms = query.toLowerCase().trim().split(/\\s+/).filter(t => t);
// Track active filters by category
const activeFilters = {
location: null,
role: null
};
function applyFilters() {
let totalVisible = 0;
const searchTerms = search.value.toLowerCase().trim().split(/\\s+/).filter(t => t);
// Build filter terms from active category filters
const locationTerms = activeFilters.location ? activeFilters.location.split(/\\s+/) : [];
const roleTerms = activeFilters.role ? activeFilters.role.split(/\\s+/) : [];
const hasFilters = searchTerms.length > 0 || locationTerms.length > 0 || roleTerms.length > 0;
// Track visible counts per company
const companyCounts = {};
companies.forEach(company => {
const companyId = company.dataset.company;
const companyJobs = company.querySelectorAll('.job');
let companyVisible = 0;
companyJobs.forEach(job => {
const searchText = job.dataset.search;
// Match if ANY term matches (OR logic for filter buttons)
const matches = terms.length === 0 || terms.some(term => searchText.includes(term));
// Match logic: AND between categories, OR within each category
let matches = true;
// Search box (OR within terms)
if (searchTerms.length > 0) {
matches = matches && searchTerms.some(term => searchText.includes(term));
}
// Location filter (OR within terms)
if (locationTerms.length > 0) {
matches = matches && locationTerms.some(term => searchText.includes(term));
}
// Role filter (OR within terms)
if (roleTerms.length > 0) {
matches = matches && roleTerms.some(term => searchText.includes(term));
}
job.classList.toggle('hidden', !matches);
if (matches) {
companyVisible++;
visible++;
totalVisible++;
}
});
company.classList.toggle('hidden', companyVisible === 0);
companyCounts[companyId] = companyVisible;
// Update company header count
const countSpan = company.querySelector('.company-count');
const total = parseInt(countSpan.dataset.total);
if (!hasFilters) {
countSpan.textContent = `${total} positions`;
} else {
countSpan.textContent = `${companyVisible}/${total} positions`;
}
});
visibleCount.textContent = `${visible} jobs shown`;
// Update TOC links - always show all, grey out empty ones
tocLinks.forEach(link => {
const companyId = link.dataset.company;
const total = parseInt(link.dataset.total);
const visible = companyCounts[companyId] || 0;
const name = link.textContent.replace(/\\s*\\(.*\\)/, '');
if (!hasFilters) {
link.textContent = `${name} (${total})`;
link.classList.toggle('empty', total === 0);
} else {
link.textContent = `${name} (${visible}/${total})`;
link.classList.toggle('empty', visible === 0);
}
// Always show the link, never hide
link.classList.remove('hidden');
});
visibleCount.textContent = `${totalVisible} jobs shown`;
}
search.addEventListener('input', (e) => {
// Clear active button when typing
function clearAllFilters() {
search.value = '';
activeFilters.location = null;
activeFilters.role = null;
filterBtns.forEach(btn => btn.classList.remove('active'));
filterJobs(e.target.value);
applyFilters();
}
search.addEventListener('input', () => {
applyFilters();
});
// Filter buttons
filterBtns.forEach(btn => {
btn.addEventListener('click', () => {
const filter = btn.dataset.filter;
search.value = filter;
filterBtns.forEach(b => b.classList.remove('active'));
const category = btn.dataset.category;
const action = btn.dataset.action;
// Handle clear button
if (action === 'clear') {
clearAllFilters();
return;
}
// Handle "All" button
if (category === 'all') {
clearAllFilters();
return;
}
// Toggle filter in category
const categoryBtns = document.querySelectorAll(`.filter-btn[data-category="${category}"]`);
if (btn.classList.contains('active')) {
// Deselect
btn.classList.remove('active');
activeFilters[category] = null;
} else {
// Select (deselect others in same category)
categoryBtns.forEach(b => b.classList.remove('active'));
btn.classList.add('active');
filterJobs(filter);
activeFilters[category] = filter;
}
applyFilters();
});
});
// Keyboard shortcut: / to focus search
document.addEventListener('keydown', (e) => {
if (e.key === '/' && document.activeElement !== search) {
e.preventDefault();
search.focus();
}
if (e.key === 'Escape') {
search.value = '';
filterBtns.forEach(b => b.classList.remove('active'));
filterJobs('');
clearAllFilters();
search.blur();
}
});
// Set "All" as active by default
filterBtns[0].classList.add('active');
</script>
</body>
</html>

File diff suppressed because it is too large Load diff

23
db.py
View file

@ -247,3 +247,26 @@ class Database:
"SELECT name FROM companies WHERE active = TRUE ORDER BY name"
)
return [row["name"] for row in cursor.fetchall()]
def cleanup_removed_companies(self, active_company_names: list[str]) -> list[str]:
    """
    Remove companies (and their jobs) that are no longer in the config.

    Args:
        active_company_names: Names of the companies currently configured.
            Every company row whose name is not in this collection is
            deleted together with its jobs.

    Returns:
        Names of the companies that were deleted, in the order the query
        returned them. Empty when nothing was removed or when
        ``active_company_names`` is empty.
    """
    names = list(active_company_names)

    # An empty list would render the filter as `NOT IN ()`. SQLite accepts
    # that and treats it as true for every row (other engines reject it as a
    # syntax error), so a config that failed to load or lost its `companies`
    # key would silently wipe every company and job. Skipping cleanup is the
    # recoverable choice; deleting history is not.
    if not names:
        return []

    # One positional placeholder per name; values are bound, never interpolated.
    placeholders = ",".join("?" * len(names))
    removed = []
    with self._get_conn() as conn:
        stale_rows = conn.execute(
            f"SELECT id, name FROM companies WHERE name NOT IN ({placeholders})",
            names,
        ).fetchall()
        for row in stale_rows:
            company_id = row["id"]
            # Jobs reference their company through company_id, so they go first.
            conn.execute("DELETE FROM jobs WHERE company_id = ?", (company_id,))
            conn.execute("DELETE FROM companies WHERE id = ?", (company_id,))
            removed.append(row["name"])
    return removed

View file

@ -20,6 +20,7 @@ services:
- /home/gruberb/.msmtprc:/root/.msmtprc:ro
environment:
- TZ=America/Toronto
- PYTHONUNBUFFERED=1
command: ["python", "main.py", "--schedule"]
restart: unless-stopped
logging:

View file

@ -145,6 +145,13 @@ def run_scraper(config: dict):
notifier = Notifier(config.get("notifications", {}))
companies = config.get("companies", [])
# Cleanup companies no longer in config
active_names = [c["name"] for c in companies]
removed = db.cleanup_removed_companies(active_names)
if removed:
print(f"\n🧹 Removed {len(removed)} companies no longer in config: {', '.join(removed)}")
print(f"\nMonitoring {len(companies)} companies...")
reports = []

106
notify.py
View file

@ -28,24 +28,23 @@ class Notifier:
if not reports_with_changes:
print("\n✓ No changes detected across all companies.")
return
# Console output (always)
else:
# Console output for changes
self._notify_console(reports_with_changes)
# Email (if configured)
# Email (if configured) - only sends when there are changes
email_config = self.config.get("email")
if email_config:
if email_config and reports_with_changes:
self._notify_email(reports_with_changes, email_config)
# msmtp (if configured - uses system msmtp config)
# msmtp (if configured - sends daily summary always)
msmtp_config = self.config.get("msmtp")
if msmtp_config:
self._notify_msmtp(reports_with_changes, msmtp_config)
self._notify_msmtp_daily_summary(reports, msmtp_config)
# Slack (if configured)
# Slack (if configured) - only sends when there are changes
slack_config = self.config.get("slack")
if slack_config:
if slack_config and reports_with_changes:
self._notify_slack(reports_with_changes, slack_config)
def _notify_console(self, reports: list[ChangeReport]):
@ -180,6 +179,95 @@ Content-Type: text/plain; charset=UTF-8
except Exception as e:
print(f"✗ Failed to send msmtp notification: {e}")
def _notify_msmtp_daily_summary(self, reports: list[ChangeReport], config: dict):
    """Send the daily summary email through the system ``msmtp`` binary.

    A message is built and sent even when no report contains changes.

    Args:
        reports: Reports to summarise. Reads ``company_name``,
            ``total_active``, ``new_jobs`` and ``removed_jobs`` from each;
            every job contributes ``title`` and, for new jobs, ``location``
            and ``remote_type``.
        config: msmtp notification settings. Optional keys ``to_addr`` and
            ``from_addr`` override the built-in default addresses.

    Returns:
        None. The outcome (sent, msmtp failure, missing binary, timeout or
        any other error) is printed rather than raised.
    """
    import subprocess
    from datetime import datetime

    # Upper bound on a single msmtp invocation; without one a stalled SMTP
    # connection would block the caller indefinitely.
    timeout_seconds = 60

    to_addr = config.get("to_addr", "me@bastiangruber.ca")
    from_addr = config.get("from_addr", "admin@novanexus.ca")

    # Totals across all reports
    total_companies = sum(1 for r in reports if r.total_active > 0)
    total_jobs = sum(r.total_active for r in reports)
    total_new = sum(len(r.new_jobs) for r in reports)
    total_removed = sum(len(r.removed_jobs) for r in reports)

    # Subject line: "+N, -M" when something changed, "No changes" otherwise
    deltas = []
    if total_new:
        deltas.append(f"+{total_new}")
    if total_removed:
        deltas.append(f"-{total_removed}")
    change_summary = ", ".join(deltas) if deltas else "No changes"
    subject = f"Job Board: {change_summary} | {total_jobs} jobs"

    # Plain text body
    body_lines = [
        "JOB BOARD DAILY SUMMARY",
        f"{datetime.now().strftime('%Y-%m-%d %H:%M')}",
        "",
        "OVERVIEW",
        f" Companies with jobs: {total_companies}",
        f" Total jobs tracked: {total_jobs}",
        "",
    ]

    reports_with_changes = [r for r in reports if r.new_jobs or r.removed_jobs]
    if reports_with_changes:
        body_lines.append(f"CHANGES: +{total_new} new, -{total_removed} removed")
        body_lines.append("-" * 40)
        for report in reports_with_changes:
            for job in report.new_jobs:
                location_str = f" [{job.location}]" if job.location else ""
                remote_str = " (Remote)" if job.remote_type == "remote" else ""
                body_lines.append(
                    f" + {report.company_name}: {job.title}{location_str}{remote_str}"
                )
            for job in report.removed_jobs:
                body_lines.append(f" - {report.company_name}: {job.title}")
        body_lines.append("")
    else:
        body_lines.append("CHANGES: No changes detected")
        body_lines.append("")

    body_lines.append("---")
    body_lines.append("https://jobs.novanexus.ca")
    body = "\n".join(body_lines)

    # Assemble headers and body explicitly so the mandatory empty line that
    # ends the header section cannot be lost to reformatting.
    email_msg = "\n".join(
        [
            f"Subject: {subject}",
            f"From: {from_addr}",
            f"To: {to_addr}",
            "Content-Type: text/plain; charset=UTF-8",
            "",
            body,
            "",
        ]
    )

    try:
        result = subprocess.run(
            # "--" ends option parsing so the recipient is never read as a flag.
            ["msmtp", "--", to_addr],
            input=email_msg,
            capture_output=True,
            text=True,
            # Encode as UTF-8 regardless of locale so the bytes on the wire
            # match the charset declared in the Content-Type header.
            encoding="utf-8",
            timeout=timeout_seconds,
        )
    except FileNotFoundError:
        print("✗ msmtp not found - install with: apt install msmtp")
    except subprocess.TimeoutExpired:
        print(f"✗ Failed to send daily summary: msmtp timed out after {timeout_seconds}s")
    except Exception as e:
        print(f"✗ Failed to send daily summary: {e}")
    else:
        if result.returncode == 0:
            print("✓ Daily summary email sent")
        else:
            print(f"✗ msmtp failed: {result.stderr}")
def _notify_slack(self, reports: list[ChangeReport], config: dict):
"""Send Slack notification."""
import httpx