package testBuff;

import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
 * Cosine similarity between two strings, computed over the frequencies of the
 * Chinese characters they contain.
 *
 * <p>Only characters accepted by {@link #isHanZi(char)} that also have a
 * two-byte GB2312 encoding are counted; every other character is ignored.
 */
public class CosineSimilarAlgorithm {

    /**
     * Returns the cosine similarity of the Chinese-character frequency vectors
     * of {@code doc1} and {@code doc2}.
     *
     * @param doc1 first document; must be non-null and non-blank
     * @param doc2 second document; must be non-null and non-blank
     * @return a value in {@code [0, 1]}; {@code 0} when either document
     *         contains no countable Chinese character (the cosine is undefined
     *         for a zero vector, and {@code 0/0} would otherwise yield NaN)
     * @throws NullPointerException if either document is null or blank
     */
    public static double getSimilarity(String doc1, String doc2) {
        if (doc1 == null || doc1.trim().length() == 0
                || doc2 == null || doc2.trim().length() == 0) {
            throw new NullPointerException(
                    "The Document is null or have not cahrs!! ");
        }

        // Key: GB2312 index of a character.
        // Value: {occurrences in doc1, occurrences in doc2}.
        Map<Integer, int[]> algorithmMap = new HashMap<Integer, int[]>();
        countHanZi(doc1, algorithmMap, 0);
        countHanZi(doc2, algorithmMap, 1);

        double sqdoc1 = 0;
        double sqdoc2 = 0;
        double dotProduct = 0;
        for (int[] c : algorithmMap.values()) {
            // Widen before multiplying so large counts cannot overflow int.
            dotProduct += (double) c[0] * c[1];
            sqdoc1 += (double) c[0] * c[0];
            sqdoc2 += (double) c[1] * c[1];
        }

        double norm = Math.sqrt(sqdoc1 * sqdoc2);
        if (norm == 0) {
            // At least one document has no countable character.
            return 0;
        }
        return dotProduct / norm;
    }

    /**
     * Adds the Chinese-character occurrences of {@code doc} to {@code counts},
     * incrementing position {@code slot} (0 or 1) of each character's pair.
     */
    private static void countHanZi(String doc, Map<Integer, int[]> counts, int slot) {
        for (int i = 0; i < doc.length(); i++) {
            char ch = doc.charAt(i);
            if (!isHanZi(ch)) {
                continue;
            }
            int charIndex = getGB2312Id(ch);
            if (charIndex == -1) {
                // No two-byte GB2312 encoding for this character; skip it.
                continue;
            }
            int[] fq = counts.get(charIndex);
            if (fq == null) {
                fq = new int[2];
                counts.put(charIndex, fq);
            }
            fq[slot]++;
        }
    }

    /**
     * Tells whether {@code ch} lies in the range U+4E00..U+9FA5 (inclusive).
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch} is within that range
     */
    public static boolean isHanZi(char ch) {
        return ch >= 0x4e00 && ch <= 0x9fa5;
    }

    /**
     * Maps a character to an index derived from its GB2312 encoding.
     *
     * @param ch the character to look up
     * @return {@code b0 * 94 + b1}, where {@code b0} and {@code b1} are the two
     *         GB2312 bytes each reduced by 0xA1; {@code -1} if the character
     *         does not encode to exactly two bytes or the GB2312 charset is
     *         unavailable
     */
    public static short getGB2312Id(char ch) {
        try {
            byte[] buffer = Character.toString(ch).getBytes("GB2312");
            if (buffer.length != 2) {
                // Anything other than two bytes means ch is not a GB2312
                // double-byte character (unmappable characters encode as '?').
                return -1;
            }
            // Both bytes start at 0xA1 (161), so subtract it to get 0-based values.
            int b0 = (buffer[0] & 0x0ff) - 161;
            // Each row holds 16 * 6 - 2 = 94 characters, hence the factor 94 below.
            int b1 = (buffer[1] & 0x0ff) - 161;
            return (short) (b0 * 94 + b1);
        } catch (UnsupportedEncodingException e) {
            // Best effort: report the problem and treat the character as unknown.
            e.printStackTrace();
        }
        return -1;
    }
}
CodePudding user response:
What kind of functionality does the original poster want? Perhaps VB has a more concise way to write this...
CodePudding user response:
This is really just word-frequency counting followed by a similarity calculation. Examples are available online; doing nothing at all yourself is lazy. Leave the thread open a few more days,
maybe it will be solved by then.