Skip to content

Commit

Permalink
feat(plugins): Add support to plugins. Move 'conditions' previously a…
Browse files Browse the repository at this point in the history
…dded as a core feature to plugins architecture.
  • Loading branch information
obetomuniz committed Mar 27, 2023
1 parent a9a1c30 commit 5dcf4d3
Show file tree
Hide file tree
Showing 13 changed files with 212 additions and 65 deletions.
32 changes: 28 additions & 4 deletions lib/engines/html/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,43 @@ const processData = async (
url: string,
options: IScrapeHtmlOptions
): TScrapedDataPromise => {
const { selectors, request } = options
const { selectors, request, plugins } = options

let html = await fetchHttp(url, request as AxiosRequestConfig)

plugins?.forEach((plugin) => {
if (plugin.preProcess) {
html = plugin.preProcess(html)
}
})

const html = await fetchHttp(url, request as AxiosRequestConfig)
const dom = new JSDOM(html)
const document = dom.window.document
return extractData(document, selectors)
}

const scrapeHtml = async (
url: string,
{ selectors, request }: IScrapeHtmlOptions
{ selectors, request, plugins }: IScrapeHtmlOptions
): TScrapedDataPromise => {
const data = await processData(url, { selectors, request })
plugins?.forEach((plugin) => {
if (
plugin.pluginType === "transformer" &&
plugin.initialize &&
plugin.supportedEngines?.includes("html")
) {
plugin.initialize({ selectors })
}
})

let data = await processData(url, { selectors, request, plugins })

plugins?.forEach((plugin) => {
if (plugin.postProcess) {
data = plugin.postProcess(data)
}
})

return data
}

Expand Down
39 changes: 32 additions & 7 deletions lib/engines/json/json.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,44 @@ import {
import fetchHttp from "../../utils/request/http"
import extractData from "../../utils/extract/json"

const processData = (
j: string,
{ selectors }: IScrapeXmlOptions
): TScrapedData => {
const processData = async (
url: string,
{ selectors, request, plugins }: IScrapeXmlOptions
): TScrapedDataPromise => {
let j = await fetchHttp(url, request)

plugins?.forEach((plugin) => {
if (plugin.preProcess) {
j = plugin.preProcess(j)
}
})

return extractData(j, selectors)
}

const scrapeJson = async (
url: string,
{ selectors, request }: IScrapeXmlOptions
{ selectors, request, plugins }: IScrapeXmlOptions
): TScrapedDataPromise => {
const j = await fetchHttp(url, request as AxiosRequestConfig)
return processData(j, { selectors })
plugins?.forEach((plugin) => {
if (
plugin.pluginType === "transformer" &&
plugin.initialize &&
plugin?.supportedEngines?.includes("json")
) {
plugin.initialize({ selectors })
}
})

let data = await processData(url, { selectors, request, plugins })

plugins?.forEach((plugin) => {
if (plugin.postProcess) {
data = plugin.postProcess(data)
}
})

return data
}

export default scrapeJson
34 changes: 29 additions & 5 deletions lib/engines/spa/spa.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,43 @@ const processData = async (
url: string,
options: IScrapeSpaOptions
): TScrapedDataPromise => {
const { selectors, request } = options
const { selectors, request, plugins } = options

const htmlContent = await fetchSpa(url, request as LaunchOptions)
const dom = new JSDOM(htmlContent)
let html = await fetchSpa(url, request as LaunchOptions)

plugins?.forEach((plugin) => {
if (plugin.preProcess) {
html = plugin.preProcess(html)
}
})

const dom = new JSDOM(html)
const document = dom.window.document
return extractData(document, selectors)
}

const scrapeSpa = async (
url: string,
{ selectors, request }: IScrapeSpaOptions
{ selectors, request, plugins }: IScrapeSpaOptions
): TScrapedDataPromise => {
const data = await processData(url, { selectors, request })
plugins?.forEach((plugin) => {
if (
plugin.pluginType === "transformer" &&
plugin.initialize &&
plugin?.supportedEngines?.includes("spa")
) {
plugin.initialize({ selectors })
}
})

let data = await processData(url, { selectors, request, plugins })

plugins?.forEach((plugin) => {
if (plugin.postProcess) {
data = plugin.postProcess(data)
}
})

return data
}

Expand Down
33 changes: 28 additions & 5 deletions lib/engines/xml/xml.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,43 @@ import fetchHttp from "../../utils/request/http"
import extractData from "../../utils/extract/xml"

const processData = async (
xml: string,
{ selectors }: IScrapeXmlOptions
url: string,
{ selectors, request, plugins }: IScrapeXmlOptions
): TScrapedDataPromise => {
let xml = await fetchHttp(url, request)

plugins?.forEach((plugin) => {
if (plugin.preProcess) {
xml = plugin.preProcess(xml)
}
})

const dom = new JSDOM(xml, { contentType: "text/xml" })
const document = dom.window.document
return extractData(document, selectors)
}

const scrapeXml = async (
url: string,
{ selectors, request }: IScrapeXmlOptions
{ selectors, request, plugins }: IScrapeXmlOptions
): TScrapedDataPromise => {
const xml = await fetchHttp(url, request as AxiosRequestConfig)
const data = await processData(xml, { selectors })
plugins?.forEach((plugin) => {
if (
plugin.pluginType === "transformer" &&
plugin.initialize &&
plugin?.supportedEngines?.includes("xml")
) {
plugin.initialize({ selectors })
}
})

let data = await processData(url, { selectors, request, plugins })

plugins?.forEach((plugin) => {
if (plugin.postProcess) {
data = plugin.postProcess(data)
}
})

return data
}
Expand Down
1 change: 1 addition & 0 deletions lib/index.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
export * from "./types"
export * from "./engines"
export * from "./plugins"
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { OperatorFn, Operators } from "../../types"
import { OperatorFn, Operators } from "./types"

const operators: Operators = {
difference: (a: string, b: string, sensitive: boolean) =>
Expand All @@ -12,7 +12,7 @@ const operators: Operators = {
}

export const applyConditions = (value: string, conditions: any[]): boolean => {
return conditions.some((condition) => {
return conditions.every((condition) => {
const operation: OperatorFn | undefined = operators[condition.operation]
return operation
? operation(value, condition.value, condition.sensitive || false)
Expand Down
31 changes: 31 additions & 0 deletions lib/plugins/conditions/plugin.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import { ITransformerPlugin, TScrapedData } from "../../types"
import { ConditionPluginType } from "./types"
import { applyConditions } from "./conditions"

/**
 * Builds the "conditions" transformer plugin.
 *
 * The returned plugin's `postProcess` hook walks every entry of the scraped
 * data. For a key that has rules in `conditions`:
 * - an array value is filtered down to the items `applyConditions` accepts;
 * - any other value is kept when `applyConditions` accepts it and replaced
 *   by `null` otherwise.
 * Keys without rules are copied through unchanged. The input object is not
 * mutated; a new object is returned.
 *
 * @param conditions - Rules per scraped-data key.
 * @returns A transformer plugin declaring the html, json, xml and spa engines.
 */
const plugin = (conditions: ConditionPluginType): ITransformerPlugin => ({
  supportedEngines: ["html", "json", "xml", "spa"],

  pluginType: "transformer",

  postProcess(data: TScrapedData) {
    const newData: TScrapedData = {}

    for (const [key, value] of Object.entries(data)) {
      // Own-property lookup only: a scraped key such as "toString" or
      // "constructor" must not resolve to an inherited Object.prototype
      // member and be handed to applyConditions as if it were a rule list.
      const rules = Object.prototype.hasOwnProperty.call(conditions, key)
        ? conditions[key]
        : undefined

      if (!rules) {
        newData[key] = value
      } else if (Array.isArray(value)) {
        newData[key] = value.filter((v) => applyConditions(v, rules))
      } else {
        newData[key] = applyConditions(value, rules) ? value : null
      }
    }

    return newData
  },
})

export default plugin
26 changes: 26 additions & 0 deletions lib/plugins/conditions/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
 * Signature shared by every condition operator.
 *
 * As invoked by `applyConditions`, `a` is the value under test, `b` is the
 * condition's reference `value`, and `sensitive` is the condition's flag
 * (defaulted to `false` by the caller). Returns whether the condition holds.
 * NOTE(review): `sensitive` presumably means case-sensitive matching —
 * confirm against the operator implementations.
 */
export type OperatorFn = (a: string, b: string, sensitive: boolean) => boolean

/** Lookup table of operator functions, indexed by operation name. */
export type Operators = {
[key: string]: OperatorFn
}

/**
 * Enum form of the operation names a condition can request.
 * Each member's value is the lowercase string used as the operation name.
 */
export enum ConditionOperationType {
Difference = "difference",
Equal = "equal",
Contains = "contains",
Regex = "regex",
}

/**
 * String-literal union with the same values as `ConditionOperationType`,
 * so callers can write plain strings instead of importing the enum.
 */
export type ConditionOperationTypes =
| "difference"
| "equal"
| "contains"
| "regex"

/** A single rule applied to a scraped value. */
export interface ConditionType {
/** Reference value handed to the operator as its second argument. */
value: string
/** Operation to run, as an enum member or its string literal. */
operation: ConditionOperationType | ConditionOperationTypes
/** Optional flag forwarded to the operator; callers treat a missing flag as `false`. */
sensitive?: boolean
}

/**
 * Configuration accepted by the conditions plugin: a map from a key to the
 * list of conditions for that key. The plugin looks keys up by the keys of
 * the scraped data object.
 */
export interface ConditionPluginType extends Record<string, ConditionType[]> {}
2 changes: 2 additions & 0 deletions lib/plugins/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// Public surface of the plugins package.
export { default as conditionsPlugin } from "./conditions/plugin"
// ConditionType is an interface and has no runtime value, so it is re-exported
// as type-only. A plain `export { ConditionType }` is rejected under
// `isolatedModules` and breaks per-file transpilers that cannot elide it.
export type { ConditionType } from "./conditions/types"
// ConditionOperationType is an enum and exists at runtime, so it stays a value export.
export { ConditionOperationType } from "./conditions/types"
58 changes: 33 additions & 25 deletions lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,63 +10,71 @@ export enum EngineType {

/** String-literal union of the engine identifiers: "html", "spa", "json" and "xml". */
export type EngineTypes = "html" | "spa" | "json" | "xml"

export enum OperationType {
Difference = "difference",
Equal = "equal",
Contains = "contains",
Regex = "regex",
}

export type OperationTypes = "difference" | "equal" | "contains" | "regex"

export type OperatorFn = (a: string, b: string, sensitive: boolean) => boolean

export type Operators = {
[key: string]: OperatorFn
}

/** Result of a scrape: an object keyed by string whose values are untyped (`any`). */
export type TScrapedData = Record<string, any>

/** Promise resolving to `TScrapedData`; the return type of the async engine functions. */
export type TScrapedDataPromise = Promise<TScrapedData>

export interface Condition {
value: string
operation: EngineType | OperationTypes
sensitive?: boolean
}
export interface ISelector {
selector: string
attribute?: string
conditions?: Condition[]
}

/** Map from a string key to the selector definition (`ISelector`) for that key. */
export type TSelectors = Record<string, ISelector>

/** Options accepted by the HTML engine. */
export interface IScrapeHtmlOptions {
/** Selector definitions handed to the extractor (and to plugin `initialize`). */
selectors: TSelectors
/** Optional Axios request configuration forwarded to the HTTP fetch. */
request?: AxiosRequestConfig
/** Optional transformer plugins whose hooks are run around fetch and extraction. */
plugins?: ITransformerPlugin[]
}

/** Options accepted by the SPA engine. */
export interface IScrapeSpaOptions {
/** Selector definitions handed to the extractor (and to plugin `initialize`). */
selectors: TSelectors
/** Optional `LaunchOptions` forwarded to the SPA fetch. */
request?: LaunchOptions
/** Optional transformer plugins whose hooks are run around fetch and extraction. */
plugins?: ITransformerPlugin[]
}

/**
 * Options for the JSON engine.
 * NOTE(review): the JSON engine functions are currently annotated with
 * `IScrapeXmlOptions` (structurally identical) rather than this interface —
 * confirm whether that is intentional.
 */
export interface IScrapeJsonOptions {
/** Selector definitions handed to the extractor (and to plugin `initialize`). */
selectors: TSelectors
/** Optional Axios request configuration forwarded to the HTTP fetch. */
request?: AxiosRequestConfig
/** Optional transformer plugins whose hooks are run around fetch and extraction. */
plugins?: ITransformerPlugin[]
}

/** Options accepted by the XML engine. */
export interface IScrapeXmlOptions {
/** Selector definitions handed to the extractor (and to plugin `initialize`). */
selectors: TSelectors
/** Optional Axios request configuration forwarded to the HTTP fetch. */
request?: AxiosRequestConfig
/** Optional transformer plugins whose hooks are run around fetch and extraction. */
plugins?: ITransformerPlugin[]
}

/**
 * Intersection of all per-engine option types, used where the engine is not
 * known statically (`IScrapeOptions.options`, `IEnginePlugin.scrape`).
 */
export type IScrapeDefaultOptions = IScrapeHtmlOptions &
IScrapeSpaOptions &
IScrapeJsonOptions &
IScrapeXmlOptions

export interface IScrapeOptions {
url: string
engine: EngineType | EngineTypes
options: IScrapeHtmlOptions &
IScrapeSpaOptions &
IScrapeJsonOptions &
IScrapeXmlOptions
options: IScrapeDefaultOptions
}

/** Enum form of the plugin kinds; each value is the lowercase string stored in `pluginType`. */
export enum PluginType {
Transformer = "transformer",
Engine = "engine",
}

/** String-literal union with the same values as `PluginType`. */
export type PluginTypes = "transformer" | "engine"

/** Base shape shared by every plugin. */
export interface IPlugin {
/** Kind of plugin; the engines compare it against "transformer" before calling `initialize`. Typed as a plain string. */
pluginType: string
}
/**
 * Plugin that hooks into an engine's scrape pipeline. All hooks are optional.
 *
 * In the engines, `initialize` runs first, then the payload is fetched, then
 * every plugin's `preProcess`, then extraction, then every plugin's
 * `postProcess`. Hooks run synchronously in array order.
 */
export interface ITransformerPlugin extends IPlugin {
/** Free-form metadata. NOTE(review): not read by the engine code in view — confirm intended use. */
meta?: any
/**
 * Engine identifiers this plugin declares support for. The engines consult
 * it only before calling `initialize`; `preProcess` and `postProcess` are
 * invoked regardless of this list.
 */
supportedEngines?: string[]
/** Receives the fetched payload as a string (HTML, JSON or XML text despite the parameter name); the return value replaces it. */
preProcess?: (html: string) => string
/** Receives the extracted data; the return value replaces it. */
postProcess?: (data: TScrapedData) => TScrapedData
/** Called with the selectors before fetching, only when `pluginType` is "transformer" and `supportedEngines` includes the running engine. */
initialize?: (options: { selectors: TSelectors }) => void
}

/**
 * Plugin that supplies a scraping engine.
 * NOTE(review): no consumer of this interface is visible here — how `engine`
 * is matched and when `scrape` is invoked should be confirmed at the call site.
 */
export interface IEnginePlugin extends IPlugin {
/** Identifier of the engine this plugin provides. */
engine: string
/** Scrapes `url` with the given options and resolves to the scraped data. */
scrape: (url: string, options: IScrapeDefaultOptions) => Promise<TScrapedData>
}
5 changes: 0 additions & 5 deletions lib/utils/extract/html.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
import { TScrapedData, TSelectors } from "../../types"
import { applyConditions } from "../tools/condition"

const getSelectorValue = (element: Element, selector: any): string | null => {
const value = selector.attribute
? element.getAttribute(selector.attribute) || ""
: element.textContent?.trim() || ""

if (selector.conditions && !applyConditions(value, selector.conditions)) {
return null
}

return value
}

Expand Down
Loading

0 comments on commit 5dcf4d3

Please sign in to comment.