package docstract

import (
	"bytes"
	"errors"
	"io/ioutil"
	"path/filepath"
	"strings"
)

// DocType is the enum-like type used for DocStract.Type.
type DocType int

// Document type values. They are untyped integer constants generated with
// iota and are assignable to DocType.
const (
	// DocUnkown represents an unknown document type.
	DocUnkown = iota
	// DocPDF represents a PDF document type.
	DocPDF
	// DocX represents a Microsoft docx document type.
	DocX
	// DocXLSX represents a Microsoft Excel document type.
	DocXLSX
	// DocHTML represents an HTML document type.
	DocHTML
)

// DocStract stores the binary data for an extracted file, as well as its
// type and file name metadata.
type DocStract struct {
	// Type is the detected document type (one of the Doc* constants).
	Type DocType
	// FileName is the detected file name; nil when no name is known.
	FileName *string
	// Bytes is the raw content of the extracted file.
	Bytes []byte
}

// SaveFile writes d.Bytes to a file named *d.FileName inside the directory
// path, with permission 0644.
//
// It returns an error when the document has no file name, or the error
// reported by ioutil.WriteFile. The document type is not checked, so a
// document of unknown type is still written as long as it has a name.
func (d *DocStract) SaveFile(path string) error {
	if d.FileName == nil {
		return errors.New("document does not have a filename cannot save")
	}
	// filepath.Join supplies the separator when path lacks a trailing one and
	// leaves the name untouched when path is empty.
	return ioutil.WriteFile(filepath.Join(path, *d.FileName), d.Bytes, 0644)
}

// getName derives the document type and file name from the last line of
// d.Bytes and stores them in d.Type and d.FileName.
//
// The last line is split on "." and classified by classifyNameLine, which
// also reports which chunk holds the name. When that chunk does not exist
// the name cannot be determined: d.FileName is set to nil, d.Type is set to
// DocUnkown and d.Bytes is left untouched.
//
// For xlsx documents d.Bytes is additionally re-sliced to start at the first
// "PK" byte pair, when one is present.
func (d *DocStract) getName() {
	// Only the text after the final newline matters; slice it out directly
	// instead of converting and splitting the whole payload.
	lastLine := string(d.Bytes[bytes.LastIndexByte(d.Bytes, '\n')+1:])
	chunks := strings.Split(lastLine, ".")

	docType, nameChunk := classifyNameLine(chunks)
	if nameChunk >= len(chunks) {
		// The line is too short to contain the expected name chunk.
		d.Type = DocUnkown
		d.FileName = nil
		return
	}

	name := StripSeperators(strings.TrimSpace(chunks[nameChunk]))

	switch docType {
	case DocPDF:
		// The first three bytes of name+extension are dropped, exactly as
		// before. NOTE(review): presumably a fixed prefix in the source
		// data; confirm against real input.
		name = (name + ".pdf")[3:]
	case DocX:
		name = (name + ".docx")[3:]
	case DocXLSX:
		name = (name + ".xlsx")[3:]
		// Drop everything before the first "PK" byte pair; keep the data
		// unchanged when no such pair exists.
		if head := bytes.Index(d.Bytes, []byte("PK")); head > 0 {
			d.Bytes = d.Bytes[head:]
		}
	case DocHTML:
		name += ".html"
	}

	d.Type = docType
	d.FileName = &name
}

// classifyNameLine inspects the "."-separated chunks of a document's last
// line and returns the detected document type together with the index of the
// chunk expected to hold the file name. The returned index may be out of
// range for short input; callers must check it before indexing. chunks must
// be non-empty, which strings.Split guarantees for a non-empty separator.
func classifyNameLine(chunks []string) (DocType, int) {
	switch {
	case len(chunks[0]) == 0:
		// An empty first chunk is treated as a pdf; the name is in chunk 2.
		return DocPDF, 2
	case strings.Contains(chunks[0], "word"):
		// docx: the name is expected in chunk 8.
		return DocX, 8
	case len(chunks) > 2 && strings.Contains(chunks[2], "worksheets"):
		// xlsx: the name follows the first chunk (from index 3 on) that
		// contains "xlsx" after separator stripping.
		for i := 3; i < len(chunks); i++ {
			if strings.Contains(StripSeperators(chunks[i]), "xlsx") {
				return DocXLSX, i + 1
			}
		}
		// No such chunk: fall back to chunk 0, as the original code did.
		return DocXLSX, 0
	default:
		// Anything else is treated as html, named after chunk 0.
		return DocHTML, 0
	}
}