move spark code to its own project
jelmerk committed Dec 30, 2023
1 parent 0f71351 commit 6db6c06
Showing 65 changed files with 29 additions and 6,038 deletions.
25 changes: 7 additions & 18 deletions .github/workflows/ci.yml
@@ -1,5 +1,8 @@
name: CI pipeline

permissions:
checks: write

on:
pull_request:
paths:
@@ -9,38 +12,24 @@ on:
- '*'
tags-ignore:
- 'v[0-9]+.[0-9]+.[0-9]+'
paths-ignore:
- '**.md'

jobs:
ci-pipeline:
runs-on: ubuntu-22.04
strategy:
fail-fast: false
matrix:
spark:
- 2.4.8
- 3.0.2
- 3.1.3
- 3.2.4
- 3.3.2
- 3.4.1
- 3.5.0

env:
ENV: 'ci'
SPARK_VERSION: ${{ matrix.spark }}

steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- uses: actions/setup-python@v5
with:
python-version: |
3.7
3.9
- name: Build and test
run: |
sbt -java-home "$JAVA_HOME_17_X64" clean +test -DsparkVersion="$SPARK_VERSION"
sbt -java-home "$JAVA_HOME_17_X64" clean +test
- name: Publish Unit test results
uses: mikepenz/action-junit-report@v4
with:
23 changes: 5 additions & 18 deletions .github/workflows/publish.yml
@@ -1,5 +1,8 @@
name: Publish pipeline

permissions:
contents: read

on:
workflow_dispatch:

@@ -10,21 +13,9 @@ on:
jobs:
publish-artifacts:
runs-on: ubuntu-22.04
strategy:
fail-fast: false
matrix:
spark:
- 2.4.8
- 3.0.2
- 3.1.3
- 3.2.4
- 3.3.2
- 3.4.1
- 3.5.0

env:
ENV: 'ci'
SPARK_VERSION: ${{ matrix.spark }}
NEXUS_USER: ${{ secrets.NEXUS_USER }}
NEXUS_PASSWORD: ${{ secrets.NEXUS_PASSWORD }}

@@ -33,16 +24,12 @@ jobs:
uses: actions/checkout@v3
with:
fetch-depth: 0
- uses: actions/setup-python@v5
with:
python-version: |
3.7
3.9
- name: Import GPG Key
uses: crazy-max/ghaction-import-gpg@v1
env:
GPG_PRIVATE_KEY: ${{ secrets.GPG_PRIVATE_KEY }}
PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
- name: Publish artifacts
run: |
sbt -java-home "$JAVA_HOME_17_X64" clean +publishSigned -DsparkVersion="$SPARK_VERSION"
sbt -java-home "$JAVA_HOME_17_X64" clean +publishSigned
sbt -java-home "$JAVA_HOME_17_X64" sonatypeBundleRelease
5 changes: 5 additions & 0 deletions .github/workflows/release.yml
@@ -1,5 +1,8 @@
name: Release pipeline

permissions:
contents: write

on:
workflow_dispatch:
inputs:
@@ -13,6 +16,8 @@ jobs:
steps:
- name: Checkout main branch
uses: actions/checkout@v3
with:
token: ${{ secrets.RELEASE_TOKEN }}
- name: Release
run: |
git config --global user.email "[email protected]"
9 changes: 0 additions & 9 deletions .gitignore
@@ -15,12 +15,3 @@ dist/

# MacOS
.DS_Store

# Virtual env
.venv

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.pytest_cache/
16 changes: 0 additions & 16 deletions .run/Template ScalaTest.run.xml

This file was deleted.

4 changes: 3 additions & 1 deletion README.md
@@ -20,7 +20,9 @@ The following distance metrics are currently pre-packaged :
- inner product
- manhattan distance

It comes with [spark integration](https://github.com/jelmerk/hnswlib/tree/master/hnswlib-spark), [pyspark integration](https://github.com/jelmerk/hnswlib/tree/master/hnswlib-pyspark) and a [scala wrapper](https://github.com/jelmerk/hnswlib/tree/master/hnswlib-scala) that should feel native to scala developers
It comes with a [scala wrapper](https://github.com/jelmerk/hnswlib/tree/master/hnswlib-scala) that should feel native to scala developers

Apache spark support was moved into the [hnswlib-spark](https://github.com/jelmerk/hnswlib-spark) project.

To find out more about how to use this library take a look at the [hnswlib-examples](https://github.com/jelmerk/hnswlib/tree/master/hnswlib-examples) module or browse the documentation
in the readme files of the submodules
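For context, the scala wrapper referenced in the updated readme exposes the index through plain Scala types. The following is a minimal sketch of what usage looks like; the `scalalike` package, the `Item` trait, and the `floatInnerProduct` helper are assumed from the wrapper's naming conventions, so check the hnswlib-scala readme for the exact API:

```scala
import com.github.jelmerk.knn.scalalike._
import com.github.jelmerk.knn.scalalike.hnsw._

// An item pairs an identifier with a vector.
case class Word(id: String, vector: Array[Float]) extends Item[String, Array[Float]] {
  override def dimensions: Int = vector.length
}

object Example extends App {
  val words = Seq(
    Word("king",  Array(0.1f, 0.9f)),
    Word("queen", Array(0.2f, 0.8f))
  )

  // Build an HNSW index over 2-dimensional float vectors using
  // inner product as the distance metric.
  val index = HnswIndex[String, Array[Float], Word, Float](
    dimensions = 2,
    distanceFunction = floatInnerProduct,
    maxItemCount = 10000
  )

  index.addAll(words)

  // Query the 10 nearest neighbours of a stored item.
  index.findNeighbors("king", k = 10).foreach(println)
}
```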
81 changes: 6 additions & 75 deletions build.sbt
@@ -1,6 +1,3 @@
import Path.relativeTo
import sys.process.*

ThisBuild / organization := "com.github.jelmerk"
ThisBuild / scalaVersion := "2.12.18"

@@ -15,15 +12,8 @@ ThisBuild / Compile / doc / javacOptions ++= {
Seq("-Xdoclint:none")
}

val java8Home = sys.env.getOrElse("JAVA_HOME_8_X64", s"${sys.props("user.home")}/.sdkman/candidates/java/8.0.382-amzn")

lazy val publishSettings = Seq(
pomIncludeRepository := { _ => false },
publishTo := {
val nexus = "https://oss.sonatype.org/"
if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots")
else Some("releases" at nexus + "service/local/staging/deploy/maven2")
},

licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")),

@@ -43,7 +33,11 @@ lazy val publishSettings = Seq(
"oss.sonatype.org",
sys.env.getOrElse("NEXUS_USER", ""),
sys.env.getOrElse("NEXUS_PASSWORD", "")
)
),

publishTo := sonatypePublishToBundle.value,
sonatypeSessionName := s"[sbt-sonatype] ${name.value} ${version.value}"

)

lazy val noPublishSettings =
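The hunk above replaces the hard-coded OSS Sonatype staging URLs with sbt-sonatype's bundle workflow: `publishSigned` now writes every artifact into a local bundle directory, and a single `sonatypeBundleRelease` (the second step in publish.yml above) creates the staging repository, uploads the whole bundle, and releases it. A minimal sketch of the moving parts, with the plugin wiring assumed and versions illustrative; only the two settings shown here actually appear in this diff:

```scala
// project/plugins.sbt -- plugin versions are illustrative
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.21")
addSbtPlugin("com.github.sbt" % "sbt-pgp"      % "2.2.1")

// build.sbt
// Stage signed artifacts into a local bundle directory instead of
// uploading each one to a remote staging repository.
publishTo := sonatypePublishToBundle.value
sonatypeSessionName := s"[sbt-sonatype] ${name.value} ${version.value}"
```

With those settings, `sbt +publishSigned` followed by `sbt sonatypeBundleRelease` reproduces what the publish workflow runs.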
@@ -54,13 +48,8 @@ val junitVersion = "5.5.2"
val hamcrestVersion = "2.1"
val mockitoVersion = "3.0.0"

val sparkVersion = settingKey[String]("Spark version")

lazy val pyTest = taskKey[Unit]("Run the python tests")
lazy val pyPublish = taskKey[Unit]("Publish the python sources to a pypi repo")

lazy val root = (project in file("."))
.aggregate(hnswlibUtils, hnswlibCore, hnswlibCoreJdk17, hnswlibMetricsDropwizard, hnswlibScala, hnswlibSpark)
.aggregate(hnswlibUtils, hnswlibCore, hnswlibCoreJdk17, hnswlibMetricsDropwizard, hnswlibScala)
.settings(noPublishSettings)

lazy val hnswlibUtils = (project in file("hnswlib-utils"))
@@ -168,62 +157,4 @@ lazy val hnswlibScala = (project in file("hnswlib-scala"))
libraryDependencies ++= Seq(
"org.scalatest" %% "scalatest" % scalaTestVersion % Test
)
)

lazy val hnswlibSpark = (project in file("hnswlib-spark"))
.dependsOn(hnswlibUtils)
.dependsOn(hnswlibScala)
.settings(
name := s"hnswlib-spark_${sparkVersion.value.split('.').take(2).mkString("-")}",
publishSettings,
crossScalaVersions := {
if (sparkVersion.value >= "3.2.0") {
Seq("2.12.18", "2.13.10")
} else if (sparkVersion.value >= "3.0.0") {
Seq("2.12.18")
} else {
Seq("2.12.18", "2.11.12")
}
},
javaHome := Some(file(java8Home)),
Compile / unmanagedSourceDirectories += baseDirectory.value / "src" / "main" / "python",
Test / unmanagedSourceDirectories += baseDirectory.value / "src" / "test" / "python",
Compile / packageBin / mappings ++= {
val base = baseDirectory.value / "src" / "main" / "python"
val srcs = base ** "*.py"
srcs pair relativeTo(base)
},
assembly / mainClass := None,
assembly / assemblyOption ~= {
_.withIncludeScala(false)
},
sparkVersion := sys.props.getOrElse("sparkVersion", "3.3.2"),
pyTest := {
val log = streams.value.log

val artifactPath = (Compile / assembly).value.getAbsolutePath
if (scalaVersion.value == "2.12.18" && sparkVersion.value >= "3.0.0" || scalaVersion.value == "2.11.12") {
val pythonVersion = if (scalaVersion.value == "2.11.12") "python3.7" else "python3.9"
val ret = Process(
Seq("./run-pyspark-tests.sh", sparkVersion.value, pythonVersion),
cwd = baseDirectory.value,
extraEnv = "JAVA_HOME" -> java8Home, "ARTIFACT_PATH" -> artifactPath
).!
require(ret == 0, "Python tests failed")
} else {
// pyspark packages support just one version of scala. You cannot use 2.13.x because it ships with 2.12.x jars
log.info(s"Running pyTests for Scala ${scalaVersion.value} and Spark ${sparkVersion.value} is not supported.")
}
},
test := {
(Test / test).value
(Test / pyTest).value
},
pyTest := pyTest.dependsOn(assembly).value,
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-hive" % sparkVersion.value % Provided,
"org.apache.spark" %% "spark-mllib" % sparkVersion.value % Provided,
"com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_1.4.7" % Test,
"org.scalatest" %% "scalatest" % scalaTestVersion % Test
)
)
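Downstream projects that depended on the deleted module now pull the artifact from the standalone hnswlib-spark build instead. Given the naming scheme in the removed code, which derives an artifact id of the form `hnswlib-spark_<major-minor>` from the Spark version, a dependency would look roughly like this; the version number is illustrative, so check the hnswlib-spark project for the actual published coordinates:

```scala
// Hypothetical downstream build.sbt (Spark 3.3 shown); the artifact id
// follows the naming scheme from the removed module definition above.
libraryDependencies += "com.github.jelmerk" %% "hnswlib-spark_3-3" % "1.0.0"
```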
@@ -1,5 +1,6 @@
package com.github.jelmerk.knn.util;

import java.io.Serial;
import java.io.Serializable;
import java.util.Arrays;

Expand Down