Skip to content

Commit

Permalink
Implement resume parsing :sweet:
Browse files Browse the repository at this point in the history
  • Loading branch information
vaibhavpandeyvpz committed Jun 23, 2024
1 parent dd984ef commit e055a64
Show file tree
Hide file tree
Showing 9 changed files with 329 additions and 11 deletions.
49 changes: 49 additions & 0 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Continuous delivery: build a multi-arch Docker image and push it to GHCR
# on every push to main.
name: CD

on:
  push:
    branches: [main]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

# Declaring `permissions` sets every unlisted scope to "none", so the read
# access needed by actions/checkout has to be granted explicitly.
permissions:
  contents: read
  packages: write

jobs:
  build:
    name: Build Docker image
    runs-on: ubuntu-latest
    steps:
      # NOTE(review): the "[email protected]" action refs below look like
      # "<action>@<version>" pins mangled by e-mail obfuscation; restore the
      # intended versions before using this workflow.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Install cosign
        uses: sigstore/[email protected]
        with:
          cosign-release: v2.1.1
      - name: Setup Docker buildx
        uses: docker/[email protected]
      - name: Log into registry ${{ env.REGISTRY }}
        uses: docker/[email protected]
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract Docker metadata
        id: meta
        uses: docker/[email protected]
        with:
          images: "${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}"
          tags: type=raw,value=latest
      - name: Build & push Docker image
        uses: docker/[email protected]
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          push: "true"
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # Reuse layers between runs through the GitHub Actions cache.
          cache-from: type=gha
          cache-to: type=gha,mode=max
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

Dockerized microservice to extract information from PDF resumes of any kind and transform it into a uniform JSON structure that can be used for further automation.

## Usage
## Development

Run below commands in project folder:

```shell
# create app config
cp config.dist.ini config.ini

# update values in config.ini
# update values in config.ini e.g., openai.* ones

# start the services
docker compose up -d
Expand Down
7 changes: 6 additions & 1 deletion app/container.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dependency_injector.containers import DeclarativeContainer, WiringConfiguration
from dependency_injector.providers import Configuration
from dependency_injector.providers import Configuration, Singleton
from openai import OpenAI
from os import path


Expand All @@ -13,3 +14,7 @@ class Container(DeclarativeContainer):
wiring_config = WiringConfiguration(
modules=[".routers.index"],
)

openai = Singleton(
OpenAI, api_key=config.openai.api_key, organization=config.openai.organization
)
66 changes: 64 additions & 2 deletions app/routers/index.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from dependency_injector.wiring import Provide, inject
from fastapi import APIRouter, Depends, status
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
import json
from openai import OpenAI
from pydantic import BaseModel
from typing import Any, Dict
from pypdf import PdfReader
from typing import Annotated, Any, Dict

from app.container import Container
from app.schema import Resume

index_router = APIRouter()

Expand All @@ -18,3 +22,61 @@ async def index(
config: Dict[str, Any] = Depends(Provide[Container.config]),
) -> IndexResponse:
return IndexResponse(env=config["app"]["env"])


def _extract_pdf_text(stream: Any) -> str:
    """Return the text of every page of the PDF in *stream*, joined by newlines."""
    reader = PdfReader(stream)
    # `or ""` keeps the join safe should a page yield no text at all.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


@index_router.post(
    "/process", status_code=status.HTTP_200_OK, response_model=Dict[str, Any]
)
@inject
async def process(
    resume: Annotated[UploadFile, File(description="Resume as a PDF file.")],
    openai: OpenAI = Depends(Provide[Container.openai]),
    config: Dict[str, Any] = Depends(Provide[Container.config]),
) -> Dict[str, Any]:
    """Parse an uploaded PDF resume into structured JSON.

    The text of the PDF is extracted with pypdf and sent to the configured
    OpenAI chat model, which is forced to answer through the ``format_resume``
    tool so that its arguments follow the ``Resume`` JSON schema.

    Args:
        resume: Uploaded file; must be sent as ``application/pdf``.
        openai: OpenAI client provided by the container.
        config: Application configuration; ``config["openai"]["model"]``
            selects the chat model.

    Returns:
        The decoded arguments of the ``format_resume`` tool call.

    Raises:
        HTTPException: 400 when the upload is not a PDF, cannot be read or
            contains no extractable text; 502 when the model's reply holds
            no usable tool call.
    """
    # Imported locally so this handler does not depend on edits to the
    # module's import block.
    import asyncio

    from pypdf.errors import PyPdfError

    if resume.content_type != "application/pdf":
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Uploaded file is not a PDF file.",
        )

    # get the PDF content; parsing is blocking work, so keep it off the event loop
    try:
        text = await asyncio.to_thread(_extract_pdf_text, resume.file)
    except PyPdfError as error:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Uploaded PDF file could not be read.",
        ) from error

    # Nothing to parse (e.g. an image-only scan): fail early instead of asking
    # the model to structure an empty document.
    if not text.strip():
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="No text could be extracted from the uploaded PDF file.",
        )

    # parse as JSON format; the client call is synchronous, so run it in a
    # worker thread rather than blocking other requests
    completion = await asyncio.to_thread(
        openai.chat.completions.create,
        messages=[
            {
                "role": "system",
                "content": "You are a bot that parses information from resume's text content into structured JSON.",
            },
            {
                "role": "user",
                "content": "Following is the text content extracted from an uploaded PDF resume. "
                "Parse it into JSON format.",
            },
            {
                "role": "user",
                "content": text,
            },
        ],
        model=config["openai"]["model"],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "format_resume",
                    "description": "Restructure data extracted from resume to the defined JSON schema.",
                    "parameters": Resume,
                },
            }
        ],
        tool_choice={"type": "function", "function": {"name": "format_resume"}},
    )

    tool_calls = completion.choices[0].message.tool_calls
    if not tool_calls:
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail="Model response did not contain the parsed resume.",
        )

    # The arguments arrive as a JSON string and may be malformed or truncated.
    try:
        output = json.loads(tool_calls[0].function.arguments)
    except json.JSONDecodeError as error:
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail="Model response could not be decoded as JSON.",
        ) from error

    if not isinstance(output, dict):
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail="Model response is not a JSON object.",
        )

    return output
119 changes: 119 additions & 0 deletions app/schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# JSON Schema describing the structured resume. It is passed as the
# `parameters` of the `format_resume` tool, so the descriptions below are
# read by the model as formatting instructions.
Resume = {
    "type": "object",
    "properties": {
        "name": {
            "type": "string",
            "description": "Full name of the candidate.",
        },
        "emails": {
            "type": "array",
            "description": "List of email addresses of the candidate.",
            "items": {"type": "string"},
        },
        "phones": {
            "type": "array",
            "description": "List of phone numbers of the candidate, in E.164 format.",
            "items": {"type": "string"},
        },
        "date_of_birth": {
            "type": "string",
            "description": "Date of birth of the candidate in YYYY-MM-DD format.",
        },
        "nationality": {
            "type": "string",
            "description": "Nationality of the candidate as ISO 3166-1 alpha-2 code.",
        },
        "about": {
            "type": "string",
            "description": "Text content from about section of the candidate.",
        },
        "skills": {
            "type": "array",
            "description": "List of skills mentioned by the candidate.",
            "items": {"type": "string"},
        },
        "hobbies": {
            "type": "array",
            "description": "List of hobbies mentioned by the candidate.",
            "items": {"type": "string"},
        },
        "languages": {
            "type": "array",
            "description": "List of languages known by the candidate as ISO 639 codes.",
            "items": {"type": "string"},
        },
        "links": {
            "type": "array",
            "description": "List of URLs or links (website, social media etc.) mentioned by the candidate.",
            "items": {"type": "string"},
        },
        "qualifications": {
            "type": "array",
            "description": "List of educational qualifications mentioned by the candidate sorted in descending order.",
            "items": {
                "type": "object",
                "properties": {
                    "degree": {
                        "type": "string",
                        "description": "Name of the degree which was achieved.",
                    },
                    "institution": {
                        "type": "string",
                        "description": "Name of the institution where this educational qualification was achieved.",
                    },
                    "started_at": {
                        "type": "string",
                        "description": "Start date of pursuing this educational qualification in YYYY-MM format.",
                    },
                    "ended_at": {
                        "type": "string",
                        # Trailing space keeps the two sentences apart once
                        # the adjacent literals are concatenated.
                        "description": "End date of pursuing this educational qualification in YYYY-MM format. "
                        "Exclude if not mentioned or currently pursuing.",
                    },
                },
                "required": ["degree", "institution"],
            },
        },
        "experiences": {
            "type": "array",
            "description": "List of work experiences mentioned by the candidate sorted in descending order.",
            "items": {
                "type": "object",
                "properties": {
                    "company": {
                        "type": "string",
                        "description": "Name of the company or organization.",
                    },
                    "designation": {
                        "type": "string",
                        "description": "Name of the role or designation at this company.",
                    },
                    "location": {
                        "type": "string",
                        "description": "Location of the company.",
                    },
                    "description": {
                        "type": "string",
                        "description": "Job description, responsibilities or project details while working here.",
                    },
                    "started_at": {
                        "type": "string",
                        "description": "Joining date at this company in YYYY-MM format.",
                    },
                    "exited_at": {
                        "type": "string",
                        "description": "Leaving or exit date at this company in YYYY-MM format. "
                        "Exclude if not mentioned or currently working.",
                    },
                    "skills": {
                        "type": "array",
                        "description": "List of skills learned or utilised while working here.",
                        "items": {"type": "string"},
                    },
                },
                # Every required name must be a declared property; the start
                # date is declared as `started_at`.
                "required": ["company", "designation", "started_at"],
            },
        },
    },
    "required": ["name"],
}
4 changes: 2 additions & 2 deletions app/web.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from .container import Container
from .routers import index_router
from app.container import Container
from app.routers import index_router

app = FastAPI()

Expand Down
5 changes: 5 additions & 0 deletions config.dist.ini
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
; Application settings.
[app]
; Environment name; returned as `env` by the index handler.
env=dev

; Settings for the OpenAI client and the resume-parsing request.
[openai]
; API key passed to the OpenAI client.
api_key=
; Organization passed to the OpenAI client.
organization=
; Model used for the chat completion that parses the resume.
model=gpt-3.5-turbo
Loading

0 comments on commit e055a64

Please sign in to comment.