#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Location-Centric JobPosting Optimizer
=====================================
This script is a specialized version dedicated to optimizing JobPosting NDJSON
with a strong focus on **Job Type Themed Title Generation** for the US Market.

What's New in this Version (Job-Type Optimization):
- MAX_TITLE_LEN_BASE is set to 65 for guaranteed brevity.
- The title generation logic uses a dynamic, prioritized selection process
  based on Job Type (FT, PT, Flexible) and secondary factors like Urgency and
  Experience Level.
- Templates are simplified to be concise and high-impact.
"""
import argparse
import json
import random
import re
import logging
import locale
import hashlib
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime, timedelta, timezone

# ==========================
# Locale initialization
# ==========================
# Try the user's locale first, then en_US, then C.UTF-8. The plain "C" locale
# is the last resort so that importing this module can never fail on a host
# that ships none of the UTF-8 locales.
try:
    locale.setlocale(locale.LC_ALL, "")
except locale.Error:
    try:
        locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
    except locale.Error:
        try:
            locale.setlocale(locale.LC_ALL, "C.UTF-8")
            logging.warning("Locale setting failed. Using 'C.UTF-8'. Number formatting might be basic.")
        except locale.Error:
            locale.setlocale(locale.LC_ALL, "C")
            logging.warning("Locale setting failed. Using 'C'. Number formatting might be basic.")

# ==========================
# Constants & Defaults
# ==========================
MAX_TITLE_LEN_BASE = 65  # Set to 65 for highly focused, short titles
DESIRED_DEFAULT_FALLBACK_LOGO_URL = "https://bgcareers.us.com/logo.png"
DEFAULT_CURRENCY = "USD"
CONTACT_CTA = ''  # Optional CTA footer
DEFAULT_FULL_TIME_HOURS_PER_WEEK = 40
DEFAULT_PART_TIME_HOURS_PER_WEEK = 23
WEEKS_PER_YEAR = 52
MONTHS_PER_YEAR = 12

# Multipliers applied to salaries when dynamic adjustment is enabled. The
# weighted list repeats 1.0 so "no change" is the most likely single outcome.
SALARY_ADJUSTMENT_FACTORS = [1.0, 0.92, 0.91, 0.88, 0.85, 1.05, 1.08, 1.10, 1.12, 1.15]
WEIGHTED_SALARY_ADJUSTMENT_FACTORS = [1.0, 1.0, 1.0] + SALARY_ADJUSTMENT_FACTORS

# Raw unitText -> human-readable unit used in salary display strings.
SALARY_UNIT_MAP = {
    'HOUR': 'Hour', 'HOURLY': 'Hour',
    'DAY': 'Day', 'DAILY': 'Day',
    'WEEK': 'Week', 'WEEKLY': 'Week',
    'MONTH': 'Month', 'MONTHLY': 'Month',
    'YEAR': 'Year', 'ANNUAL': 'Year', 'ANNUALLY': 'Year',
    'PROJECT': 'Project',
}

# Raw unitText -> canonical unit used for conversions. Daily rates are mapped
# to PROJECT, which means no period conversions are computed for them.
NORMALIZED_SALARY_UNITS = {
    "HOUR": "HOUR", "HOURLY": "HOUR",
    "DAY": "PROJECT", "DAILY": "PROJECT",
    "WEEK": "WEEK", "WEEKLY": "WEEK",
    "MONTH": "MONTH", "MONTHLY": "MONTH",
    "YEAR": "YEAR", "ANNUAL": "YEAR", "ANNUALLY": "YEAR",
    "PROJECT": "PROJECT",
}

# Inclusive (min, max) months-of-experience range for each level.
EXPERIENCE_LEVEL_MONTHS = {
    "Entry-Level": (0, 12),
    "Mid-Level": (13, 60),
    "Senior-Level": (61, float('inf')),
}

# Lower-case keywords searched in the experience description text.
EXPERIENCE_LEVEL_KEYWORDS = {
    "Entry-Level": ["entry level", "no experience", "graduate", "junior", "trainee", "intern"],
    "Mid-Level": ["mid level", "intermediate", "associate", "experienced"],
    "Senior-Level": ["senior", "lead", "principal", "expert", "staff level", "manager"],
}

RECENT_POST_DAYS = 2
CLOSING_SOON_DAYS = 7

# ==================================================
# Template definitions (Shared)
# ==================================================
GEO_HEADER_TEMPLATES = [
    "Local Job Overview: {city}",
    "Position in {city}, {state}",
    "Opportunity in {city}",
    "Role Based in {city}",
    "{city} Role Highlights",
    "Your Next Step in {city}",
]

SUB_HEADER_TEMPLATES = {
    "primary": [
        "Key Information", "Fast Facts", "Role Essentials",
        "At a Glance", "Primary Details", "Snapshot",
    ],
    "secondary": [
        "Compensation & Benefits", "What You Get", "Pay & Perks",
        "Rewards", "Salary and Benefits",
    ],
    "tertiary": [
        "Core Focus", "What You'll Do", "Responsibilities",
        "Day-to-Day", "Position Scope",
    ],
}

LOCAL_AREA_DESCRIPTORS = [
    "a vibrant local hub",
    "a key business district",
    "our growing {city} office",
    "a central {city} location",
    "our main {city} branch",
]

# ==================================================
# Job-Type Themed Description Templates
# ==================================================
# Keyed by the employment-type template key, then by tone, then by phrase slot.
GEO_TONE_TEMPLATES = {
    "FULL_TIME": {
        'informational': {
            'intro': [
                "Full-Time Role in {city}: {role_for_summary}",
                "Position: {role_for_summary} (Full-Time, Based in {city})",
            ],
            'company': ["Employer: {company_for_summary} (Hiring in {city})"],
            'location': [
                "Location: {city}, {state}",
                "Based in: {city}, {state} ({local_area_info})",
            ],
            'salary': [
                "Compensation: {salary_primary_display}{converted_salary_suffix}",
                "Pay: {salary_primary_display}{converted_salary_suffix}",
            ],
            'experience_entry': ["Experience: Entry-Level (Full Training in {city})."],
            'experience_mid': ["Experience: Mid-Level."],
            'experience_senior': ["Experience: Senior-Level."],
            'urgency_new': ["Status: New {city} Full-Time listing (actively interviewing)."],
            'urgency_closing': ["Urgency: Final applications accepted."],
            'focus': ["Apply your {primary_skill} skills at our {city} location."],
            'benefits': [
                "A comprehensive benefits package is included.",
                "Eligible team members receive full benefits.",
            ],
            'value_props': [
                "Clear career pathways at our {city} office.",
                "A stable, long-term position with modern tooling.",
            ],
        }
    },
    "PART_TIME": {
        'informational': {
            'intro': [
                "Part-Time Job in {city}: {role_for_summary}",
                "Position: {role_for_summary} (Part-Time, Based in {city})",
            ],
            'company': ["Employer: {company_for_summary} (Hiring for PT roles in {city})"],
            'location': [
                "Location: {city}, {state}",
                "Based in: {city}, {state} ({local_area_info})",
            ],
            'salary': [
                "Compensation: {salary_primary_display}{converted_salary_suffix}",
                "Pay: {salary_primary_display}{converted_salary_suffix}",
            ],
            'experience_entry': ["Experience: Entry-Level (Training in {city})."],
            'experience_mid': ["Experience: Mid-Level."],
            'experience_senior': ["Experience: Senior-Level."],
            'urgency_new': ["Status: New Part-Time listing (actively interviewing)."],
            'urgency_closing': ["Urgency: Final applications for PT role accepted."],
            'focus': ["Apply your {primary_skill} skills at our {city} location."],
            'benefits': [
                "A competitive benefits package is available.",
                "Eligible part-time team members receive benefits.",
            ],
            'value_props': [
                "Excellent work-life balance in {city}.",
                "Flexible scheduling to fit your needs.",
            ],
        }
    },
    "FLEXIBLE": {
        'informational': {
            'intro': [
                "Flexible Role in {city}: {role_for_summary}",
                "Position: {role_for_summary} (Flexible/Contract, {city})",
            ],
            'company': ["Employer: {company_for_summary} (Hiring in {city})"],
            'location': [
                "Location: {city}, {state}",
                "Based in: {city}, {state} ({local_area_info})",
            ],
            'salary': [
                "Compensation: {salary_primary_display}{converted_salary_suffix}",
                "Pay: {salary_primary_display}{converted_salary_suffix}",
            ],
            'experience_entry': ["Experience: Open to all levels."],
            'experience_mid': ["Experience: Mid-Level."],
            'experience_senior': ["Experience: Senior-Level."],
            'urgency_new': ["Status: New {city} listing (actively interviewing)."],
            'urgency_closing': ["Urgency: Final applications accepted."],
            'focus': ["Apply your {primary_skill} skills at our {city} location."],
            'benefits': [
                "A competitive compensation package is offered.",
                "Benefits may be available based on employment type.",
            ],
            'value_props': [
                "Flexible contract/gig opportunity in {city}.",
                "Project-based work with clear objectives.",
            ],
        }
    },
}

HIRING_SYNS = [
    "Hiring", "Immediate Start", "Now Hiring", "Apply Today",
    "Urgent Hire", "Join Team", "Recruiting", "Apply Now",
]
ENTRY_SYNS = ["Entry Level", "No Experience", "Junior Role", "Graduate Role", "Training Provided"]
URGENCY_TAGS_NEW_TITLE = ["New", "Just Posted", "Recent Job"] URGENCY_TAGS_CLOSING_TITLE = ["Apply Soon", "Hiring Now", "Urgent"] # ================================================== # NEW: Job-Type Themed Title Templates (CONCISE & PRIORITY BASED) # ================================================== GEO_FOCUSED_TITLE_TEMPLATES = { "FULL_TIME": { # PRIORITY 1: High-Impact & Geo-Focused (Role, Type, City/State) 'PRIMARY': [ "{city} Full-Time: {role}", "{role} Career - {city}, {state}", "{role} (Full-Time) - {city}", "{city} Career: {role}", ], # PRIORITY 2: Urgency/Experience Integration 'SECONDARY': [ "{urgency_tag}: {role} FT ({city})", "{experience_tag} {role} (Full-Time) | {city}", "{role} (FT) - {city}, {state} ({urgency_tag})", ], # PRIORITY 3: Simple Fallback 'FALLBACK': [ "{role} (FT) | {city}", "{city} {role} Job", ] }, "PART_TIME": { # PRIORITY 1: High-Impact & Geo-Focused (Role, Type, City/State) 'PRIMARY': [ "{city} Part-Time: {role}", "Flexible PT {role} ({city})", "{role} (PT) - {city}, {state}", ], # PRIORITY 2: Urgency/Experience Integration 'SECONDARY': [ "{urgency_tag}: {role} PT ({city})", "{experience_tag} {role} (Part-Time) | {city}", "Immediate Need: PT {role} - {city}", ], # PRIORITY 3: Simple Fallback 'FALLBACK': [ "{role} (Part-Time) | {city}", "{city} PT Opening: {role}", ] }, "FLEXIBLE": { # PRIORITY 1: High-Impact & Geo-Focused (Role, Type, City/State) 'PRIMARY': [ "{city} Contract: {role}", "{role} (Gig) - {city}", "Flexible {role} ({city})", ], # PRIORITY 2: Urgency/Experience Integration 'SECONDARY': [ "{urgency_tag}: {role} (Contract, {city})", "{experience_tag} {role} (Flexible) | {city}", "Immediate Opening: {role} (Gig)", ], # PRIORITY 3: Simple Fallback 'FALLBACK': [ "{role} (Contract) | {city}", "{city} Flexible Role: {role}", ] } } # --- END TEMPLATE CUSTOMIZATION --- JOB_SEO_KEYWORDS = [ "hiring", "apply now", "urgent", "career", "benefits", "salary", "immediate" ] SECTION_KEYWORDS_MAP = { "responsibilities": ["key 
responsibilities", "responsibilities", "your role", "what you'll do", "duties", "main duties", "primary accountabilities"], "requirements": ["requirements", "qualifications", "essential skills", "your qualifications", "must-have qualifications", "to succeed you'll need", "your profile"], "skills": ["skills", "technical skills", "soft skills", "key competencies"], "benefits": ["benefits", "perks", "what we offer", "why join us"], "experience": ["experience", "professional background", "experience level"], "incentives": ["incentives", "incentive compensation", "bonus", "commission"], "workhours": ["work hours", "hours of work", "schedule"] } # ========================== # Argparse # ========================== def parse_args(): p = argparse.ArgumentParser(description="Optimize Local JobPosting NDJSON with a strong location-centric focus.") p.add_argument('-i','--input', default='all-schemas.ndjson', help='Input NDJSON file') p.add_argument('-o','--output', default='schema.ndjson', help='Output NDJSON file') p.add_argument('--seed', type=int, default=None, help='Seed for reproducible randomness') p.add_argument('--logo_cdn', default=DESIRED_DEFAULT_FALLBACK_LOGO_URL, help='Default fallback logo URL') p.add_argument('--currency', default=DEFAULT_CURRENCY, help='Default currency') p.add_argument('-v','--verbose', action='store_true', help='Enable debug logging') p.add_argument('--full_time_hours', type=int, default=DEFAULT_FULL_TIME_HOURS_PER_WEEK, help="Standard hours for full-time salary conversions.") p.add_argument('--part_time_hours', type=int, default=DEFAULT_PART_TIME_HOURS_PER_WEEK, help="Standard hours for part-time salary conversions.") p.add_argument('--enable_salary_adjustment', action='store_true', help='Enable dynamic salary value adjustments (random increase/decrease/no change).') # --- Mutually exclusive group for title operations --- title_group = p.add_mutually_exclusive_group() title_group.add_argument('--no-change-in-title', action='store_true', 
help='Do not improve the job title; keep the original (still optimizes description, etc.).') title_group.add_argument('--slightly-improve-title', action='store_true', help='Apply only minor, geo-focused improvements to the original title and leave all other fields unchanged.') return p.parse_args() # ========================== # Small utilities # ========================== def title_case(s: str) -> str: if not s: return "" words = [w.capitalize() if not (w.isupper() and len(w) > 1) else w for w in s.split()] return " ".join(words) def to_dhaka_offset(dt: str) -> str: if not dt: return "" try: parsed_dt = datetime.fromisoformat(dt.replace('Z', '+00:00')) dhaka_tz = timezone(timedelta(hours=6)) return parsed_dt.astimezone(dhaka_tz).isoformat() except ValueError: logging.warning(f"Invalid date format for Dhaka offset: {dt}. Returning original.") return dt def to_midnight(dt_str: str) -> str: if not dt_str: return "" try: dt_obj = datetime.fromisoformat(dt_str.replace('Z', '+00:00')) target_tz = dt_obj.tzinfo or timezone(timedelta(hours=6)) dt_obj_midnight = dt_obj.astimezone(target_tz).replace(hour=23, minute=59, second=59, microsecond=0) return dt_obj_midnight.isoformat() except (IndexError, ValueError) as e: logging.warning(f"Date parse error for to_midnight: {dt_str}. Error: {e}. 
Returning original.") return dt_str def normalize_url(url: str) -> str: if not url: return "" url = url.split('?',1)[0].split('#',1)[0] if url.startswith('//'): url = 'https://' + url[2:] elif url.startswith('http://'): url = 'https://' + url[7:] elif not url.startswith('https://'): url = 'https://' + url.lstrip('/') if not re.search(r'\.\w{2,5}(?:/)?$', url.split('/')[-1]) and not url.endswith('/'): url += '/' return url def get_currency_symbol(currency_code: str) -> str: symbols = {"USD":"$", "EUR":"€", "GBP":"£", "JPY":"¥", "CAD":"CA$", "AUD":"A$", "INR":"₹", "BDT":"৳"} return symbols.get(str(currency_code).upper(), (str(currency_code) + " ") if currency_code else "$") # ========================== # Intelligence helpers # ========================== def get_primary_skill(skills_value) -> str: if not skills_value: return "" processed = "" if isinstance(skills_value, str): processed = skills_value elif isinstance(skills_value, list): string_skills = [] for item in skills_value: if isinstance(item, str) and item.strip(): string_skills.append(item.strip()) elif item is not None: try: s_item = str(item).strip() string_skills.append(s_item) except Exception: pass processed = ", ".join(string_skills) else: try: processed = str(skills_value).strip() except Exception: return "" if not processed: return "" try: return next((s.strip() for s in re.split(r'[,;/]', processed) if s.strip()), "") except TypeError: return "" def clean_text_to_list(text_content: str) -> list[str]: if not text_content: return [] soup = BeautifulSoup(text_content, 'html.parser') return [li.get_text(separator=' ', strip=True) for li in soup.find_all('li') if li.get_text(strip=True)] or \ [p.get_text(separator=' ', strip=True) for p in soup.find_all('p') if p.get_text(strip=True)] or \ ([soup.get_text(strip=True)] if soup.get_text(strip=True) else []) def get_location_details(rec: dict) -> tuple[str, str, str]: job_loc_data = rec.get('jobLocation') city, state, country = "", "", "US" if 
isinstance(job_loc_data, dict): address_data = job_loc_data.get('address') if isinstance(address_data, dict): city = address_data.get('addressLocality', '') state = address_data.get('addressRegion', '') country = address_data.get('addressCountry', country) if isinstance(address_data.get('addressCountry'), str) else country elif isinstance(job_loc_data, list) and job_loc_data: first_loc = job_loc_data[0] if isinstance(first_loc, dict): address_data = first_loc.get('address') if isinstance(address_data, dict): city = address_data.get('addressLocality', '') state = address_data.get('addressRegion', '') country = address_data.get('addressCountry', country) if isinstance(address_data.get('addressCountry'), str) else country if not city and state: city = state elif not city and not state: logging.debug(f"Job ID {rec.get('@id', 'Unknown')}: Missing addressLocality and addressRegion.") return str(city), str(state), str(country) def get_employment_types_info(rec: dict) -> dict: et_input = rec.get('employmentType') normalized_types = [] if isinstance(et_input, list): for item in et_input: if isinstance(item, str) and item.strip(): norm_item = item.replace('_', '-').strip().lower() normalized_types.append(title_case(norm_item.replace(" time", "-Time"))) elif isinstance(et_input, str) and et_input.strip(): norm_item = et_input.replace('_', '-').strip().lower() normalized_types.append(title_case(norm_item.replace(" time", "-Time"))) unique_types = sorted(list(set(normalized_types))) chosen_for_title, chosen_for_description = "Flexible", "Flexible" # --- Determine template_key --- template_key = "FLEXIBLE" if "Full-Time" in unique_types and len(unique_types) == 1: template_key = "FULL_TIME" elif "Part-Time" in unique_types and len(unique_types) == 1: template_key = "PART_TIME" if not unique_types: schema_list = ["OTHER"] else: is_full_time = "Full-Time" in unique_types is_part_time = "Part-Time" in unique_types if is_full_time and is_part_time: chosen_for_title, 
chosen_for_description = "Full/Part-Time", "Full-Time" elif len(unique_types) == 1: chosen_for_title, chosen_for_description = unique_types[0], unique_types[0] else: # Prioritize FT or PT if they exist, otherwise pick one chosen = "Full-Time" if is_full_time else ("Part-Time" if is_part_time else random.choice(unique_types)) chosen_for_title, chosen_for_description = chosen, chosen schema_list = [t.upper().replace('-', '_') for t in unique_types] return { 'all_available_display': unique_types or ["Flexible"], 'title_display': chosen_for_title, 'chosen_for_description': chosen_for_description, 'schema_list': schema_list, 'template_key': template_key } def get_industries_info(rec: dict) -> dict: industry_input = rec.get('industry') processed_industries = [] if isinstance(industry_input, list): for item in industry_input: if isinstance(item, str) and item.strip(): processed_industries.append(title_case(item.replace('&', 'and').strip())) elif isinstance(industry_input, str) and industry_input.strip(): processed_industries.append(title_case(industry_input.replace('&', 'and').strip())) unique_industries = sorted(list(set(processed_industries))) if not unique_industries: return {'display_list': [], 'title_display': "", 'schema_list': []} return { 'display_list': unique_industries, 'title_display': random.choice(unique_industries), 'schema_list': unique_industries } def get_experience_level_info(rec: dict) -> dict: exp_req = rec.get("experienceRequirements", {}) months_exp = None level_tag = "" description_tag = "" if isinstance(exp_req, dict): months_str = exp_req.get("monthsOfExperience") desc_str = exp_req.get("description", "").lower() if months_str is not None: try: months_exp = int(months_str) except (ValueError, TypeError): pass if months_exp is not None: for level, (min_m, max_m) in EXPERIENCE_LEVEL_MONTHS.items(): if min_m <= months_exp <= max_m: description_tag = level level_tag = level.split('-')[0] break else: for level, keywords in 
EXPERIENCE_LEVEL_KEYWORDS.items(): if any(kw in desc_str for kw in keywords): description_tag = level level_tag = level.split('-')[0] break if not level_tag and any(syn.lower() in desc_str for syn in ENTRY_SYNS): description_tag, level_tag = "Entry-Level", "Entry" return {"title_tag": level_tag, "description_tag": description_tag, "months": months_exp} def get_job_urgency_tags(date_posted_str: str, valid_through_str: str, rec_id: str) -> dict: urgency = {"title_tag": "", "description_key": None} now_utc = datetime.now(timezone.utc) if date_posted_str: try: posted_dt = datetime.fromisoformat(date_posted_str.replace('Z', '+00:00')).astimezone(timezone.utc) if (now_utc - posted_dt).days <= RECENT_POST_DAYS: urgency.update({"title_tag": random.choice(URGENCY_TAGS_NEW_TITLE), "description_key": "new"}) except ValueError as e: logging.debug(f"JID {rec_id}: Err parsing datePosted '{date_posted_str}': {e}") if valid_through_str: try: valid_dt = datetime.fromisoformat(valid_through_str.replace('Z', '+00:00')).astimezone(timezone.utc) if timedelta(days=0) <= (valid_dt - now_utc) <= timedelta(days=CLOSING_SOON_DAYS): urgency.update({"title_tag": random.choice(URGENCY_TAGS_CLOSING_TITLE), "description_key": "closing"}) except ValueError as e: logging.debug(f"JID {rec_id}: Err parsing validThrough '{valid_through_str}': {e}") return urgency def to_k_notation(num_val: float, currency_symbol: str) -> str: if abs(num_val) >= 1000: k_val = num_val / 1000.0 return f"{currency_symbol}{k_val:.1f}k".replace(".0k", "k") return f"{currency_symbol}{int(num_val)}" def format_salary_details(rec: dict, currency_symbol: str = "$", enable_dynamic_adjustment: bool = False, chosen_emp_type: str = "Full-Time", full_time_hours: int = DEFAULT_FULL_TIME_HOURS_PER_WEEK, part_time_hours: int = DEFAULT_PART_TIME_HOURS_PER_WEEK) -> dict: base = rec.get('baseSalary', {}) val_obj = base.get('value', {}) if not isinstance(base, dict): base = {} if not isinstance(val_obj, dict): val_obj = {} minv_raw = 
val_obj.get('minValue', base.get('minValue')) maxv_raw = val_obj.get('maxValue', base.get('maxValue')) unit_raw = str(val_obj.get('unitText', base.get('unitText', ''))).upper() primary_unit_normalized = NORMALIZED_SALARY_UNITS.get(unit_raw, "PROJECT") def parse_salary_value(s_val): if s_val is None: return None if isinstance(s_val, (int, float)): return float(s_val) if isinstance(s_val, str): s_val_cleaned = str(s_val).replace(currency_symbol, '').replace(',', '').strip() if "negotiable" in s_val_cleaned.lower() or not s_val_cleaned: return "Negotiable" try: return float(s_val_cleaned) except ValueError: return None return None min_val_num = parse_salary_value(minv_raw) max_val_num = parse_salary_value(maxv_raw) if min_val_num == "Negotiable" or max_val_num == "Negotiable": return {"primary_display": "Negotiable", "is_negotiable": True, "conversions": {}, "adjusted_factor": 1.0} if min_val_num is None and max_val_num is None: return {"primary_display": "", "is_negotiable": False, "conversions": {}, "adjusted_factor": 1.0} adjustment_factor = 1.0 if enable_dynamic_adjustment: adjustment_factor = random.choice(WEIGHTED_SALARY_ADJUSTMENT_FACTORS) if isinstance(min_val_num, (int, float)): min_val_num *= adjustment_factor if isinstance(max_val_num, (int, float)): max_val_num *= adjustment_factor if isinstance(min_val_num, (int, float)) and isinstance(max_val_num, (int, float)) and min_val_num > max_val_num: min_val_num, max_val_num = max_val_num, min_val_num primary_value_for_conversion, primary_display_val = None, "Error" if min_val_num is not None and max_val_num is not None: primary_value_for_conversion = (min_val_num + max_val_num) / 2.0 primary_display_val = f"{to_k_notation(min_val_num, currency_symbol)}-{to_k_notation(max_val_num, currency_symbol)}" elif max_val_num is not None: primary_value_for_conversion, primary_display_val = max_val_num, f"Up to {to_k_notation(max_val_num, currency_symbol)}" elif min_val_num is not None: primary_value_for_conversion, 
primary_display_val = min_val_num, to_k_notation(min_val_num, currency_symbol) else: return {"primary_display": "Negotiable", "is_negotiable": True, "conversions": {}, "adjusted_factor": adjustment_factor} primary_unit_display = SALARY_UNIT_MAP.get(primary_unit_normalized, "") primary_salary_str = f"{primary_display_val}{'/' + primary_unit_display if primary_unit_display and primary_unit_display != 'Project' else ''}" conversions, converted_values_num = {}, {} hours_per_week = part_time_hours if "part-time" in chosen_emp_type.lower() else full_time_hours if primary_value_for_conversion is not None and primary_unit_normalized != "PROJECT": annual_equiv = None if primary_unit_normalized == "HOUR": annual_equiv = primary_value_for_conversion * hours_per_week * WEEKS_PER_YEAR elif primary_unit_normalized == "WEEK": annual_equiv = primary_value_for_conversion * WEEKS_PER_YEAR elif primary_unit_normalized == "MONTH": annual_equiv = primary_value_for_conversion * MONTHS_PER_YEAR elif primary_unit_normalized == "YEAR": annual_equiv = primary_value_for_conversion if annual_equiv is not None: converted_values_num.update({ "YEAR": annual_equiv, "MONTH": annual_equiv / MONTHS_PER_YEAR, "WEEK": annual_equiv / WEEKS_PER_YEAR }) if hours_per_week > 0: converted_values_num["HOUR"] = (annual_equiv / WEEKS_PER_YEAR) / hours_per_week for unit, val in converted_values_num.items(): if unit != primary_unit_normalized: conversions[unit] = f"{to_k_notation(val, currency_symbol)}/{SALARY_UNIT_MAP.get(unit, '')}" return { "primary_display": primary_salary_str, "primary_raw_min": min_val_num, "primary_raw_max": max_val_num, "primary_unit_normalized": primary_unit_normalized, "is_negotiable": False, "conversions": conversions, "converted_raw": converted_values_num, "adjusted_factor": adjustment_factor } # ========================== # Content assembly & enrichment # (Refactored for Geo-Focus) # ========================== def clean_role_and_company(original_title: str, org_name_from_ho: str) -> 
tuple[str, str]: org_name = str(org_name_from_ho or "").strip() role = re.sub(r'\s*$.*?[mfvdix].*?$\s*', '', str(original_title), flags=re.IGNORECASE).strip() role = re.sub(r"\s+jobs?\b", "", role, flags=re.IGNORECASE).strip() company, final_role = org_name, role if not company: preps = ["at", "for", "with"] for prep in preps: match = re.search(rf"^(.*?)\s+{re.escape(prep)}\s+([\w\s.,'&()-]+)$", role, flags=re.IGNORECASE) if match and 2 <= len(match.group(2).split()) <= 5: company, final_role = match.group(2).strip(), match.group(1).strip() break if company: final_role = re.sub(rf"\s*\b{re.escape(company)}\b", "", final_role, flags=re.IGNORECASE).strip(" -|,") if not company: company = "A Leading Local Company" if not final_role: final_role = "Associate" return final_role.strip(), company.strip() def geo_context_enrichment(html_block: str, role: str, company: str, primary_skill: str, city: str, state: str, industry_display: str) -> str: """NEW: Minimal enrichment layer to add location-specific context. """ soup = BeautifulSoup(html_block or "", 'html.parser') text = soup.decode_contents() if html_block else "" # Controlled synonym map synonyms = { r"\bcompany\b": "organization", r"\bemployees\b": "team members", r"\bsalary\b": "compensation", r"\bjob\b": "role", r"\bexperience\b": "background", r"\bbenefits\b": "perks" } for pat, repl in synonyms.items(): text = re.sub(pat, repl, text, flags=re.IGNORECASE) # NEW: Geo-Context stitches stitches = [ f"

This {city}-based role is an excellent opportunity for professionals skilled in {primary_skill or 'relevant skills'}.

", f"

Our {company} team in {city}, {state} is growing.

", f"

Benefit from working in {city}, a key hub for the {industry_display} industry.

" ] # Insert stitches under first UL; else create a fresh list ul = soup.find('ul') if ul: for item in stitches: ul.append(BeautifulSoup(item, 'html.parser')) else: ul_new = BeautifulSoup("

", 'html.parser') for item in stitches: ul_new.ul.append(BeautifulSoup(item, 'html.parser')) soup.append(ul_new) return soup.decode_contents() # --- UPDATED FUNCTION --- def create_geo_targeted_summary(rec: dict, primary_skill: str, salary_details: dict, job_urgency: dict, exp_level_info: dict, industries_info: dict, emp_types_info: dict) -> str: """ NEW: Builds an SEO summary with a strong location-first emphasis. NOW: Uses Job-Type Themed templates. """ role_for_summary, company_for_summary = clean_role_and_company(rec.get('title',''), rec.get('hiringOrganization',{}).get('name','')) city, state, _ = get_location_details(rec) # --- NEW: Select template bank based on job type --- emp_type_key = emp_types_info.get('template_key', 'FLEXIBLE') tone_bank = GEO_TONE_TEMPLATES.get(emp_type_key, GEO_TONE_TEMPLATES['FLEXIBLE']) active = tone_bank['informational'] # --- END NEW --- salary_primary_display = salary_details.get("primary_display", "Negotiable") if salary_details else "Negotiable" converted_salary_suffix = "" if salary_details and salary_details.get("conversions"): best_conv = salary_details["conversions"].get("YEAR") or random.choice(list(salary_details["conversions"].values())) converted_salary_suffix = f" (approx. 
{best_conv})" data = { "role_for_summary": title_case(role_for_summary), "company_for_summary": company_for_summary, "city": city, "state": state, "local_area_info": random.choice(LOCAL_AREA_DESCRIPTORS).format(city=city), "salary_primary_display": salary_primary_display, "converted_salary_suffix": converted_salary_suffix, "primary_skill": primary_skill or "your professional skills", "industry_display": (industries_info['display_list'][0] if industries_info and industries_info.get('display_list') else "a dynamic") } def phrase(key): return random.choice(active[key]).format(**data) job_details = [phrase('intro'), phrase('company'), phrase('location')] urgency_key = job_urgency.get("description_key") if urgency_key: job_details.append(phrase(f'urgency_{urgency_key}')) exp_level = exp_level_info.get("description_tag") if exp_level == "Entry-Level": job_details.append(phrase('experience_entry')) elif exp_level == "Mid-Level": job_details.append(phrase('experience_mid')) elif exp_level == "Senior-Level": job_details.append(phrase('experience_senior')) comp_benefits = [phrase('salary'), f"Benefits: {random.choice(active['benefits'])}"] value_props = [f"{random.choice(active['value_props']).format(city=city)}"] role_focus = [random.choice(active['focus']).format(**data)] summary_html = f"

{random.choice(GEO_HEADER_TEMPLATES).format(city=city, state=state)}

" structure_choice = random.randint(1, 3) if structure_choice == 1: random.shuffle(job_details) summary_html += ( f"

{random.choice(SUB_HEADER_TEMPLATES['primary'])}

" ) summary_html += ( f"

{random.choice(SUB_HEADER_TEMPLATES['secondary'])}

" ) summary_html += ( f"

{random.choice(SUB_HEADER_TEMPLATES['tertiary'])}

" ) elif structure_choice == 2: all_parts = job_details + comp_benefits + value_props + role_focus random.shuffle(all_parts) summary_html += ( f"

{random.choice(SUB_HEADER_TEMPLATES['primary'])}

" ) else: summary_html += f"

{' '.join(job_details)}

" summary_html += f"

{' '.join(comp_benefits + value_props)}

" summary_html += f"

{random.choice(SUB_HEADER_TEMPLATES['tertiary'])}

{role_focus[0]}

" # Enrich with stitched context and synonyms summary_html = geo_context_enrichment( summary_html, role=title_case(role_for_summary), company=company_for_summary, primary_skill=primary_skill, city=city, state=state, industry_display=data["industry_display"] ) return BeautifulSoup(summary_html, 'html.parser').decode_contents().strip() # --- END UPDATED FUNCTION --- # ========================== # Title building & SEO post-processing # ========================== def enforce_length(title: str, max_len: int) -> str: if len(title) <= max_len: return title.strip() shortened = re.sub(r'\s*$[^)]*$\s*$', '', title).strip() if len(shortened) <= max_len: return shortened while len(title) > max_len: parts = title.rsplit(' ', 1) if len(parts) > 1: title = parts[0] else: return title[:max_len-3].strip() + "..." return title.strip(" -|,( ") def enrich_title_for_seo(title: str) -> str: """Append one missing, high-intent keyword if space allows; avoid stuffing.""" t = title for kw in JOB_SEO_KEYWORDS: if kw.lower() not in t.lower() and len(t) <= 60: # keep margin t = f"{t} | {kw.title()}" break return t # --- UPDATED FUNCTION --- def generate_location_focused_title(rec: dict, primary_skill: str, salary_details: dict, job_urgency:dict, exp_level_info:dict, emp_types_info:dict, industries_info:dict, dynamic_max_len:int) -> str: """ NEW: Generates SEO title using a prioritized, job-type themed selection. """ ho_name = rec.get('hiringOrganization', {}).get('name', '') cleaned_role, company_name = clean_role_and_company(rec.get('title', rec.get('name','')), ho_name) city, state, _ = get_location_details(rec) # Note: Salary/Company not used directly in new, short templates, but parsed anyway. 
# --- UPDATED FUNCTION ---
def generate_location_focused_title(rec: dict, primary_skill: str, salary_details: dict, job_urgency: dict,
                                    exp_level_info: dict, emp_types_info: dict, industries_info: dict,
                                    dynamic_max_len: int) -> str:
    """
    Generate an SEO title using a job-type themed template selection.

    The template bank is looked up in ``GEO_FOCUSED_TITLE_TEMPLATES`` by
    ``emp_types_info['template_key']`` (falling back to the 'FLEXIBLE' bank).
    'SECONDARY' templates are only considered when an urgency or experience
    tag is available; 'PRIMARY' and 'FALLBACK' templates are always
    considered. The candidates are shuffled, and the first template whose
    placeholders all have non-empty values and whose length-enforced result
    is not the bare "..." is returned. If none qualifies, a plain
    "<Role> Job in <city>" title is returned.

    Uses the module-level ``random`` state (one ``choice`` and one
    ``shuffle``), so output depends on how the caller seeded it.
    """
    # hiringOrganization may be a dict or a bare string; the pipeline only
    # normalizes it after the title has been generated.
    org = rec.get('hiringOrganization')
    if isinstance(org, dict):
        ho_name = org.get('name', '')
    elif isinstance(org, str):
        ho_name = org.strip()
    else:
        ho_name = ''

    cleaned_role, company_name = clean_role_and_company(rec.get('title', rec.get('name', '')), ho_name)
    city, state, _ = get_location_details(rec)
    city = city or ''
    state = state or ''

    # Note: Salary/Company not used directly in new, short templates, but parsed anyway.
    salary_fmt = ""
    if salary_details and salary_details.get("primary_display") and not salary_details.get("is_negotiable"):
        salary_fmt = salary_details["primary_display"]
        prim_unit = salary_details.get("primary_unit_normalized")
        conv = salary_details.get("conversions") or {}
        if prim_unit == "HOUR" and "YEAR" in conv:
            salary_fmt = f"{salary_details['primary_display'].split('/')[0]}/hr"

    parts = {
        "role": title_case(cleaned_role),
        "city": title_case(city),
        "state": state.upper(),
        "job_type": emp_types_info.get('title_display', "Flexible"),
        "hiring": random.choice(HIRING_SYNS),
        "urgency_tag": job_urgency.get('title_tag', ''),
        "experience_tag": exp_level_info.get('title_tag', ''),
        "company": title_case(company_name),
        "salary_compact": salary_fmt,
        "skill1": title_case(primary_skill),
        "industry": industries_info.get('title_display', ""),
    }

    # --- 1. Determine Template Bank ---
    emp_type_key = emp_types_info.get('template_key', 'FLEXIBLE')
    template_bank = GEO_FOCUSED_TITLE_TEMPLATES.get(emp_type_key, GEO_FOCUSED_TITLE_TEMPLATES['FLEXIBLE'])

    # --- 2. Determine Priority Queue ---
    priority_queue = []
    # Secondary templates only make sense when one of their tags has a value.
    if parts['urgency_tag'] or parts['experience_tag']:
        priority_queue.extend(template_bank.get('SECONDARY', []))
    # Always add primary geo-focused templates and fallbacks
    priority_queue.extend(template_bank.get('PRIMARY', []))
    priority_queue.extend(template_bank.get('FALLBACK', []))
    # Shuffle the entire queue to introduce variety
    random.shuffle(priority_queue)

    # --- 3. Iterate and Select ---
    for tmpl in priority_queue:
        # Skip templates that need a component we do not have a value for.
        required_parts = re.findall(r'\{([^{}]+)\}', tmpl)
        if not all(parts.get(p) for p in required_parts):
            continue

        try:
            temp_title = tmpl.format(**parts)
        except (KeyError, IndexError, ValueError):
            logging.debug(f"Template {tmpl} failed due to missing required key.")
            continue

        # Clean up formatting issues
        temp_title = re.sub(r'\s{2,}', ' ', temp_title).strip()
        # NOTE(review): this removes ALL whitespace around separators
        # ("A - B, C" -> "A-B,C"); kept as-is, confirm that is the intended look.
        temp_title = re.sub(r'\s*([-|(),:•—])\s*', r'\1', temp_title)
        temp_title = temp_title.replace('()', '').replace('[]', '').strip(" -|,: •—")
        temp_title = re.sub(r'\s*-\s*-\s*', '-', temp_title)

        final_title = enforce_length(temp_title, dynamic_max_len)
        if final_title != "...":
            return final_title  # Found a title, stop searching

    # Final fallback if prioritized generation fails; omit " in <city>" when
    # no city is known so the title does not end in a dangling "Job in".
    fallback_title = title_case(cleaned_role) + " Job"
    if city:
        fallback_title += " in " + city
    return enforce_length(fallback_title, dynamic_max_len)
# --- END UPDATED FUNCTION ---
def generate_slight_title_improvement(original_title: str, city: str, state: str, urgency_tag: str, max_len: int) -> str:
    """
    Apply a minor, additive improvement to the original job title
    (the --slightly-improve-title mode).

    Only information not already present in the title is added: city/state
    and/or the urgency tag. Template buckets are tried in priority order
    (urgent+geo, geo, urgent, state-only); templates inside a bucket are
    shuffled with the module-level ``random`` state. The first rendered
    title that fits within ``max_len`` is returned. If nothing fits, the
    cleaned original title is returned, shortened via ``enforce_length``.

    ``None`` values for any of the string arguments are treated as empty.
    (This function is NOT themed by job type, as it's a minimal-change operation.)
    """
    # Missing values must not crash the unconditional .upper()/.lower() calls below.
    original_title = original_title or ""
    city = city or ""
    state = state or ""
    urgency_tag = urgency_tag or ""

    title = original_title.strip()

    # 1. More gentle cleanup: just remove trailing separators
    cleaned_title = title.strip(" -|,: •—")
    title_lower = cleaned_title.lower()

    # 2. Check what information is *already* present
    has_city = bool(city) and city.lower() in title_lower
    # Check for state abbreviation, avoiding spaces (e.g., "NY" in "JobNY").
    # NOTE(review): this looks only at the title's capital letters, so
    # "Night Yard Nurse" also counts as containing "NY" -- confirm acceptable.
    has_state = bool(state) and state.upper() in re.sub(r'[^A-Z]', '', title)
    has_urgency = bool(urgency_tag) and urgency_tag.lower() in title_lower

    # 3. Define prioritized template buckets
    # {title} is the cleaned original title
    template_buckets = {
        'urgent_geo': [
            "{urgency_tag}: {title} ({city})",
            "{title} - {city}, {state} ({urgency_tag})",
            "{title} ({city}) - {urgency_tag}",
        ],
        'geo': [
            "{title} - {city}, {state}",
            "{title} | {city}, {state}",
            "{title} in {city}",
            "{city} Opening: {title}",
            "{title} ({city})",
        ],
        'urgent': [
            "{title} ({urgency_tag})",
            "{title} - {urgency_tag}",
            "{urgency_tag}: {title}",
            "{title} | {urgency_tag}",
        ],
        'state_only': [
            "{title} ({state} Opening)",
            "{title} - {state}",
        ],
    }

    parts = {
        "title": cleaned_title,
        "city": title_case(city),
        "state": state.upper(),
        "urgency_tag": urgency_tag,
    }

    # 4. Define which buckets to try, in order of priority,
    #    based on *new* information we can add
    can_add_geo = bool(city) and not has_city
    can_add_urgency = bool(urgency_tag) and not has_urgency
    # Only use state-only if city isn't available/used
    can_add_state = bool(state) and not has_state and not can_add_geo

    priority_order = []
    if can_add_urgency and can_add_geo:
        priority_order.append('urgent_geo')
    if can_add_geo:
        priority_order.append('geo')
    if can_add_urgency:
        priority_order.append('urgent')
    if can_add_state:
        priority_order.append('state_only')

    # 5. Iterate through prioritized buckets
    for bucket_name in priority_order:
        templates = template_buckets[bucket_name]
        random.shuffle(templates)  # Shuffle templates *within* the priority bucket

        for tmpl in templates:
            # Skip the template if any non-title placeholder has no value
            # (e.g. missing {state} for a "{city}, {state}" template).
            placeholders = re.findall(r'\{([^{}]+)\}', tmpl)
            if not all(parts.get(p) for p in placeholders if p != 'title'):
                continue

            new_title = tmpl.format(**parts)

            # Clean up potential formatting issues
            new_title = re.sub(r'\s{2,}', ' ', new_title).strip()
            new_title = re.sub(r'\s*([-|(),:•—])\s*', r'\1', new_title)
            new_title = new_title.replace('()', '').replace('[]', '').strip(" -|,: •—")

            if len(new_title) <= max_len:
                return enforce_length(new_title, max_len)  # Found a good one that fits

    # 6. If no template was applied, return the cleaned original title
    return enforce_length(cleaned_title, max_len)

# ==========================
# Description assembler
# ==========================
# --- UPDATED FUNCTION ---
def assemble_location_focused_description(html_str: str, rec: dict, primary_skill: str = "",
                                          salary_details: dict = None, job_urgency: dict = None,
                                          exp_level_info: dict = None, industries_info: dict = None,
                                          emp_types_info: dict = None) -> str:
    """
    Assemble the final description: geo-targeted summary first, then the
    original description, then the optional ``CONTACT_CTA`` footer.

    The original description is dropped (summary only) when it is blank or
    when its visible text is just the placeholder "no description provided.".
    ``emp_types_info`` is forwarded to ``create_geo_targeted_summary``
    together with the other context arguments.
    """
    original_html_content = html_str or ""
    stripped_content = original_html_content.strip()

    # Compare on visible text (tags removed, whitespace collapsed) so the
    # placeholder is recognised whether or not it is wrapped in markup.
    visible_text = ' '.join(re.sub(r'<[^>]*>', ' ', stripped_content).split()).lower()
    original_was_empty = not stripped_content or visible_text == "no description provided."

    # Use the new geo-targeted summary function
    seo_summary_html = create_geo_targeted_summary(
        rec, primary_skill, salary_details, job_urgency,
        exp_level_info, industries_info, emp_types_info,
    )

    # NOTE(review): the separators below are plain blank lines; if HTML
    # wrappers were intended here, restore them.
    if original_was_empty:
        final_description = seo_summary_html
    else:
        final_description = seo_summary_html + "\n\n" + original_html_content

    if CONTACT_CTA:
        final_description += f"\n\n{CONTACT_CTA}\n\n"
    return final_description
# --- END UPDATED FUNCTION ---
" return final_description # --- END UPDATED FUNCTION --- # ========================== # Hiring org normalization # ========================== def normalize_hiring_org(org: dict, logo_cdn: str) -> dict: if not isinstance(org, dict): return {} sa = org.get('sameAs', '') if sa and isinstance(sa, str) and not sa.startswith(('http://', 'https://')): org['sameAs'] = 'https://' + sa.lstrip('/') if not (isinstance(org.get('logo'), str) and org['logo'].startswith(('http://', 'https://'))): org['logo'] = logo_cdn return org # ========================== # Core pipeline # ========================== ARGS = None def rewrite_geotargeted_job_records(in_path: str, out_path: str, seed: int=None, logo_cdn: str = DESIRED_DEFAULT_FALLBACK_LOGO_URL, default_currency_arg: str = DEFAULT_CURRENCY, enable_salary_adj_arg: bool = False, full_time_hours_arg: int = DEFAULT_FULL_TIME_HOURS_PER_WEEK, part_time_hours_arg: int = DEFAULT_PART_TIME_HOURS_PER_WEEK, no_change_in_title_arg: bool = False, slightly_improve_title_arg: bool = False): logging.info(f"Optimizing Geo-Targeted Jobs: {in_path} -> {out_path} with seed {seed}") if no_change_in_title_arg: logging.info("Running with --no-change-in-title. Original titles will be preserved.") if slightly_improve_title_arg: logging.info("Running with --slightly-improve-title. 
def _seed_rng_for_record(jid, seed) -> None:
    """Seed ``random`` from the job id and run seed, stable across processes."""
    # The built-in hash() of a str is salted per interpreter process, so it
    # cannot be used for reproducible seeding; derive the seed from SHA-256.
    digest = hashlib.sha256(f"{jid}{seed}".encode("utf-8")).digest()
    random.seed(int.from_bytes(digest[:8], "big"))


def _adaptive_title_max_len(stats: dict, num_lines: int) -> int:
    """Return the title length cap, lightly tuned once enough records were seen."""
    max_len = MAX_TITLE_LEN_BASE
    # Light adaptive tuning after half the dataset (at least 10 records) is processed
    if stats["total"] >= max(10, num_lines // 2):
        avg_title_len = stats["title_len_sum"] / max(stats["total"], 1)
        if avg_title_len < 52:
            max_len = min(84, MAX_TITLE_LEN_BASE + 10)
        elif avg_title_len > 72:
            max_len = max(60, MAX_TITLE_LEN_BASE - 5)
    return max_len


def _fallback_job_url(title, jid) -> str:
    """Build a canonical job URL from the title slug, or from a hash of the id."""
    slug = re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-') if isinstance(title, str) else ''
    if not slug:
        # No title, or a title without any [a-z0-9] characters.
        slug = hashlib.sha1(str(jid).encode()).hexdigest()[:10]
    return f"https://bgcareers.us.com/jobs/{slug}/"


def _normalize_base_salary(rec: dict, curr_code: str, salary_details: dict) -> None:
    """Fill in / clean up ``rec['baseSalary']`` in place from the parsed salary details."""
    base_salary = rec.get('baseSalary')
    if base_salary is None:
        base_salary = {}
    if not isinstance(base_salary, dict):
        return  # Unknown shape (e.g. a bare number): leave untouched

    salary_details = salary_details or {}
    base_salary.setdefault('@type', 'MonetaryAmount')
    base_salary['currency'] = curr_code

    value = base_salary.get('value', {})
    if not isinstance(value, dict):
        value = {}
    value.setdefault('@type', 'QuantitativeValue')

    raw_min = salary_details.get("primary_raw_min")
    raw_max = salary_details.get("primary_raw_max")
    if salary_details.get("is_negotiable"):
        value.update({'description': "Negotiable", 'minValue': None, 'maxValue': None, 'unitText': None})
    elif raw_min is not None or raw_max is not None:
        value.update({
            # Only stringify real numbers; str(None) would emit the text "None".
            'minValue': str(raw_min) if raw_min is not None else None,
            'maxValue': str(raw_max) if raw_max is not None else None,
            'unitText': (salary_details.get("primary_unit_normalized") or "PROJECT").upper(),
        })

    # Remove null description field in baseSalary.value if present
    if value.get('description') is None:
        value.pop('description', None)

    base_salary['value'] = value
    rec['baseSalary'] = base_salary


def rewrite_geotargeted_job_records(in_path: str, out_path: str, seed: int = None,
                                    logo_cdn: str = DESIRED_DEFAULT_FALLBACK_LOGO_URL,
                                    default_currency_arg: str = DEFAULT_CURRENCY,
                                    enable_salary_adj_arg: bool = False,
                                    full_time_hours_arg: int = DEFAULT_FULL_TIME_HOURS_PER_WEEK,
                                    part_time_hours_arg: int = DEFAULT_PART_TIME_HOURS_PER_WEEK,
                                    no_change_in_title_arg: bool = False,
                                    slightly_improve_title_arg: bool = False):
    """Read JobPosting NDJSON from ``in_path`` and write optimized records to ``out_path``.

    Each non-blank line is parsed as one JSON object; lines that are not
    valid JSON objects are logged and skipped. The ``random`` module is
    re-seeded per record from the record id and ``seed``.

    Modes:
      * ``slightly_improve_title_arg``: only ``title`` is rewritten (via
        ``generate_slight_title_improvement``); records without a string
        title are copied through unchanged.
      * otherwise: description, title (unless ``no_change_in_title_arg``),
        employmentType, industry, url, hiringOrganization and baseSalary are
        rewritten, schema.org ``@context``/``@type`` are set, and top-level
        ``None`` fields are removed.

    Records are written with ``sort_keys=True``. A missing input file or any
    unexpected error is logged rather than raised; the completion message is
    only logged when processing finished without error.
    """
    logging.info(f"Optimizing Geo-Targeted Jobs: {in_path} -> {out_path} with seed {seed}")
    if no_change_in_title_arg:
        logging.info("Running with --no-change-in-title. Original titles will be preserved.")
    if slightly_improve_title_arg:
        logging.info("Running with --slightly-improve-title. ONLY titles will be modified; all other fields preserved.")

    # Adaptive stats (Contextual Behavior Simulation)
    stats = {"total": 0, "title_len_sum": 0, "desc_len_sum": 0}

    try:
        # First pass: count records so tqdm and the adaptive tuning know the total.
        with open(in_path, 'r', encoding='utf-8') as fin_check:
            num_lines = sum(1 for line in fin_check if line.strip())

        if num_lines == 0:
            logging.warning(f"Input file '{in_path}' is empty.")
            with open(out_path, 'w', encoding='utf-8'):
                pass  # Create/truncate the output file
            return

        with open(in_path, 'r', encoding='utf-8') as fin, open(out_path, 'w', encoding='utf-8') as fout:
            for line in tqdm(fin, total=num_lines, desc="Processing Geo-Targeted Jobs"):
                if not line.strip():
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError as e:
                    logging.warning(f"Bad JSON: {e}. Line: {line[:70]}...")
                    continue
                if not isinstance(rec, dict):
                    logging.warning("Line is not a JSON object, skipping.")
                    continue

                jid = rec.get('@id') or rec.get('url') or hashlib.sha256(line.encode()).hexdigest()
                _seed_rng_for_record(jid, seed)

                # --- Title-only branch for --slightly-improve-title ---
                if slightly_improve_title_arg:
                    original_title = rec.get('title', rec.get('name', ''))
                    if not original_title or not isinstance(original_title, str):
                        # Nothing to improve: pass the record through verbatim,
                        # making sure it stays on its own line.
                        fout.write(line if line.endswith("\n") else line + "\n")
                        continue

                    city, state, _ = get_location_details(rec)
                    job_urgency = get_job_urgency_tags(rec.get('datePosted'), rec.get('validThrough'), jid)
                    dynamic_max_len = _adaptive_title_max_len(stats, num_lines)

                    new_title = generate_slight_title_improvement(
                        original_title, city, state,
                        job_urgency.get('title_tag', ''),
                        dynamic_max_len
                    )
                    rec['title'] = new_title

                    stats["total"] += 1
                    stats["title_len_sum"] += len(new_title)
                    fout.write(json.dumps(rec, ensure_ascii=False, sort_keys=True) + "\n")
                    continue
                # --- End of title-only branch ---

                # --- FULL PROCESSING LOGIC (only runs if --slightly-improve-title is OFF) ---
                # Extract & Normalize
                primary_skill = get_primary_skill(rec.get('skills', ''))
                emp_types_info = get_employment_types_info(rec)  # Contains the 'template_key'
                industries_info = get_industries_info(rec)
                exp_level_info = get_experience_level_info(rec)

                if rec.get('datePosted'):
                    rec['datePosted'] = to_dhaka_offset(rec['datePosted'])
                if rec.get('validThrough'):
                    rec['validThrough'] = to_midnight(rec['validThrough'])

                job_urgency = get_job_urgency_tags(rec.get('datePosted'), rec.get('validThrough'), jid)

                # baseSalary may be absent, null, or not an object at all.
                raw_salary = rec.get('baseSalary')
                record_currency = raw_salary.get('currency') if isinstance(raw_salary, dict) else None
                curr_code = record_currency or default_currency_arg
                curr_symbol = get_currency_symbol(curr_code)
                salary_details = format_salary_details(
                    rec, curr_symbol, enable_salary_adj_arg,
                    emp_types_info['chosen_for_description'],
                    full_time_hours_arg, part_time_hours_arg
                )

                # Description (prepend geo-targeted semantic SEO summary)
                rec['description'] = assemble_location_focused_description(
                    rec.get('description', ''), rec, primary_skill, salary_details, job_urgency,
                    exp_level_info, industries_info, emp_types_info
                )

                # Title (with adaptive max length)
                dynamic_max_len = _adaptive_title_max_len(stats, num_lines)
                if no_change_in_title_arg:
                    logging.debug(f"JID {jid}: Keeping original title.")
                else:
                    rec['title'] = generate_location_focused_title(
                        rec, primary_skill, salary_details, job_urgency,
                        exp_level_info, emp_types_info, industries_info, dynamic_max_len
                    )

                # Employment & Industry for schema
                rec['employmentType'] = emp_types_info['schema_list'] or None
                rec['industry'] = industries_info['schema_list'] or None

                # URL normalization or generation
                if rec.get('url'):
                    rec['url'] = normalize_url(rec['url'])
                else:
                    rec['url'] = normalize_url(_fallback_job_url(rec.get('title'), jid))

                # Hiring Organization
                ho = rec.get('hiringOrganization')
                if isinstance(ho, dict):
                    rec['hiringOrganization'] = normalize_hiring_org(ho, logo_cdn)
                elif isinstance(ho, str) and ho.strip():
                    rec['hiringOrganization'] = normalize_hiring_org(
                        {"@type": "Organization", "name": ho.strip()}, logo_cdn)
                else:
                    _, cname = clean_role_and_company(rec.get('title', ''), None)
                    rec['hiringOrganization'] = normalize_hiring_org(
                        {"@type": "Organization", "name": cname}, logo_cdn)

                # Base salary cleanup
                _normalize_base_salary(rec, curr_code, salary_details)

                # Schema baseline
                rec.update({'@context': 'http://schema.org', '@type': 'JobPosting'})

                # Clean empty fields
                for k in list(rec.keys()):
                    if rec[k] is None:
                        del rec[k]

                # Stats tracking
                stats["total"] += 1
                stats["title_len_sum"] += len(str(rec.get('title', '')))
                stats["desc_len_sum"] += len(
                    BeautifulSoup(rec.get('description', ''), 'html.parser').get_text())

                fout.write(json.dumps(rec, ensure_ascii=False, sort_keys=True) + "\n")

        # After run, log adaptive insight
        if stats["total"]:
            avg_t = stats["title_len_sum"] / stats["total"]
            avg_d = stats["desc_len_sum"] / stats["total"]
            logging.info(f"Adaptive tuning summary: avg title length = {avg_t:.1f}, avg description length = {avg_d:.1f} chars, n={stats['total']}")

    except FileNotFoundError:
        logging.error(f"Input file '{in_path}' not found.")
        return
    except Exception as e:
        # Top-level boundary: report with traceback instead of crashing the CLI.
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)
        return

    logging.info(f"Processing complete. Output: {out_path}")
# ==========================
# Entrypoint
# ==========================
def main():
    """Parse CLI arguments, configure logging, and run the rewrite pipeline.

    The parsed arguments are also stored in the module-level ``ARGS``.
    """
    global ARGS
    ARGS = parse_args()

    logging.basicConfig(
        level=logging.DEBUG if ARGS.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Hand every CLI option to the main processing function
    rewrite_geotargeted_job_records(
        in_path=ARGS.input,
        out_path=ARGS.output,
        seed=ARGS.seed,
        logo_cdn=ARGS.logo_cdn,
        default_currency_arg=ARGS.currency,
        enable_salary_adj_arg=ARGS.enable_salary_adjustment,
        full_time_hours_arg=ARGS.full_time_hours,
        part_time_hours_arg=ARGS.part_time_hours,
        no_change_in_title_arg=ARGS.no_change_in_title,
        slightly_improve_title_arg=ARGS.slightly_improve_title,
    )


if __name__ == "__main__":
    main()